Spaces:

adarshnagrikar
/

studio-ai

Runtime error

studio-ai / src /layers_cache.py

adarshnagrikar14

feat: add error handling and prompt enhancement in image generation

51246d7 9 months ago

18.6 kB

	import inspect
	import math
	from typing import Callable, List, Optional, Tuple, Union
	from einops import rearrange
	import torch
	from torch import nn
	import torch.nn.functional as F
	from torch import Tensor
	from diffusers.models.attention_processor import Attention


	class LoRALinearLayer(nn.Module):
	    """Low-rank ``down`` -> ``up`` linear adapter that only acts on one token span.

	    ``forward`` zeroes every token of the input except the span belonging to
	    condition number ``number`` before running the two bias-free projections,
	    so the output is non-zero only for that span.

	    Args:
	        in_features: Size of the last dimension of the input.
	        out_features: Size of the last dimension of the output.
	        rank: Width of the low-rank bottleneck.
	        network_alpha: Optional scaling numerator; when given, the output is
	            multiplied by ``network_alpha / rank``.
	        device: Optional device the two linear layers are moved to.
	        dtype: Optional dtype the two linear layers are cast to.
	        cond_width: Width used to derive the per-condition token count.
	        cond_height: Height used to derive the per-condition token count.
	        number: Zero-based index of the condition span this layer acts on.
	        n_loras: Total number of condition spans at the end of the sequence.
	    """

	    def __init__(
	        self,
	        in_features: int,
	        out_features: int,
	        rank: int = 4,
	        network_alpha: Optional[float] = None,
	        device: Optional[Union[torch.device, str]] = None,
	        dtype: Optional[torch.dtype] = None,
	        cond_width=512,
	        cond_height=512,
	        number=0,
	        n_loras=1
	    ):
	        super().__init__()
	        self.down = nn.Linear(in_features, rank, bias=False)
	        self.up = nn.Linear(rank, out_features, bias=False)

	        # Only forward the placement arguments that were actually supplied.
	        placement = {}
	        if device is not None:
	            placement['device'] = device
	        if dtype is not None:
	            placement['dtype'] = dtype
	        if placement:
	            self.down = self.down.to(**placement)
	            self.up = self.up.to(**placement)

	        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
	        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
	        self.network_alpha = network_alpha
	        self.rank = rank
	        self.out_features = out_features
	        self.in_features = in_features

	        # `up` starts at zero, so a freshly built layer outputs zeros.
	        nn.init.normal_(self.down.weight, std=1 / rank)
	        nn.init.zeros_(self.up.weight)

	        self.cond_height = cond_height
	        self.cond_width = cond_width
	        self.number = number
	        self.n_loras = n_loras

	    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
	        """Apply the adapter to this layer's condition span only.

	        Args:
	            hidden_states: Tensor indexed as ``(batch, tokens, features)``; the
	                last ``n_loras * cond_size`` tokens are treated as condition
	                spans.

	        Returns:
	            Tensor with ``out_features`` as last dimension, in the dtype of the
	            input. Tokens outside this layer's span are zero.
	        """
	        orig_dtype = hidden_states.dtype
	        dtype = self.down.weight.dtype

	        seq_len = hidden_states.shape[1]
	        # NOTE: evaluated strictly left to right, exactly as written.
	        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
	        block_size = seq_len - cond_size * self.n_loras
	        span_start = block_size + self.number * cond_size
	        span_end = span_start + cond_size

	        # Broadcastable token mask: 1 inside this layer's span, 0 elsewhere.
	        # Shape (1, tokens, 1) works for any batch size and feature width
	        # instead of assuming a fixed feature size.
	        mask = torch.ones((1, seq_len, 1), device=hidden_states.device, dtype=dtype)
	        mask[:, :span_start, :] = 0
	        mask[:, span_end:, :] = 0
	        hidden_states = mask * hidden_states

	        down_hidden_states = self.down(hidden_states.to(dtype))
	        up_hidden_states = self.up(down_hidden_states)

	        if self.network_alpha is not None:
	            up_hidden_states = up_hidden_states * (self.network_alpha / self.rank)

	        return up_hidden_states.to(orig_dtype)


	class MultiSingleStreamBlockLoraProcessor(nn.Module):
	    """Attention processor adding per-condition LoRA to q/k/v, with caching.

	    The last ``n_loras * cond_size`` tokens of ``hidden_states`` are treated as
	    condition tokens. While ``bank_kv`` is empty, a call runs the full LoRA
	    path and stores the condition key/value slices in ``bank_kv`` and the
	    condition attention output in ``bank_attn``. Later calls skip the LoRA
	    layers and reuse those cached tensors. Nothing in this class empties
	    ``bank_kv``; it has to be cleared from outside for the LoRA path to run
	    again.

	    Args:
	        dim: Feature width passed to every ``LoRALinearLayer``.
	        ranks: Per-condition LoRA ranks (indexed ``0 .. n_loras - 1``).
	        lora_weights: Per-condition multipliers applied to the LoRA outputs.
	        network_alphas: Per-condition ``network_alpha`` values.
	        device: Optional device for the LoRA layers and this module.
	        dtype: Optional dtype for the LoRA layers.
	        cond_width: Width used to derive the per-condition token count.
	        cond_height: Height used to derive the per-condition token count.
	        n_loras: Number of condition spans.
	    """

	    def __init__(self, dim: int, ranks=None, lora_weights=None, network_alphas=None,
	                 device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
	        super().__init__()
	        # `None` defaults avoid sharing one mutable list between instances.
	        ranks = [] if ranks is None else ranks
	        network_alphas = [] if network_alphas is None else network_alphas

	        self.n_loras = n_loras
	        self.cond_width = cond_width
	        self.cond_height = cond_height

	        def build_lora_bank():
	            # One LoRA layer per condition span, each restricted to span `i`.
	            return nn.ModuleList([
	                LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype,
	                                cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
	                for i in range(n_loras)
	            ])

	        # Creation order (q, k, v) is kept so parameter registration order
	        # and random initialisation order stay the same.
	        self.q_loras = build_lora_bank()
	        self.k_loras = build_lora_bank()
	        self.v_loras = build_lora_bank()
	        self.lora_weights = [] if lora_weights is None else lora_weights
	        self.bank_attn = None
	        self.bank_kv = []

	        # Move processor to device if specified
	        if device is not None:
	            self.to(device)

	    @staticmethod
	    def _split_heads(tensor, batch_size, heads, head_dim):
	        """Reshape ``(batch, tokens, heads * head_dim)`` to ``(batch, heads, tokens, head_dim)``."""
	        return tensor.view(batch_size, -1, heads, head_dim).transpose(1, 2)

	    def _build_attention_mask(self, seq_len, block_size, cond_size, device, dtype):
	        """Build the additive mask: 0 where attention is allowed, -1e10 where blocked.

	        Rows before ``block_size`` may attend to every token; rows of a
	        condition span may attend only to their own span.
	        """
	        mask = torch.ones((seq_len, seq_len), device=device)
	        mask[:block_size, :] = 0
	        for i in range(self.n_loras):
	            start = block_size + i * cond_size
	            end = start + cond_size
	            mask[start:end, start:end] = 0
	        return (mask * -1e10).to(dtype)

	    def __call__(self,
	                 attn: Attention,
	                 hidden_states: torch.FloatTensor,
	                 encoder_hidden_states: torch.FloatTensor = None,
	                 attention_mask: Optional[torch.FloatTensor] = None,
	                 image_rotary_emb: Optional[torch.Tensor] = None,
	                 use_cond=False
	                 ) -> torch.FloatTensor:
	        """Run attention over ``hidden_states`` and split off the condition tokens.

	        Args:
	            attn: Attention module providing ``to_q``/``to_k``/``to_v``,
	                ``heads``, ``norm_q`` and ``norm_k``.
	            hidden_states: Input indexed as ``(batch, tokens, features)``.
	            encoder_hidden_states: Only used to read the batch size when given.
	            attention_mask: Accepted for interface compatibility; not used.
	            image_rotary_emb: Optional rotary embedding applied to query and key.
	            use_cond: When true, also return the condition-token output.

	        Returns:
	            The first ``block_size`` tokens of the attention output, or a
	            ``(hidden_states, cond_hidden_states)`` tuple when ``use_cond``.
	        """
	        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
	        total_len = hidden_states.shape[1]
	        # NOTE: evaluated strictly left to right, exactly as written.
	        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
	        block_size = total_len - cond_size * self.n_loras

	        # An empty bank means this call must compute and store the cache.
	        cache = not self.bank_kv

	        query = attn.to_q(hidden_states)
	        key = attn.to_k(hidden_states)
	        value = attn.to_v(hidden_states)
	        if cache:
	            # LoRA deltas are only computed on the caching pass.
	            for i in range(self.n_loras):
	                scale = self.lora_weights[i]
	                query = query + scale * self.q_loras[i](hidden_states)
	                key = key + scale * self.k_loras[i](hidden_states)
	                value = value + scale * self.v_loras[i](hidden_states)

	        head_dim = key.shape[-1] // attn.heads
	        query = self._split_heads(query, batch_size, attn.heads, head_dim)
	        key = self._split_heads(key, batch_size, attn.heads, head_dim)
	        value = self._split_heads(value, batch_size, attn.heads, head_dim)

	        if cache:
	            # Stored before normalisation and rotary embedding are applied.
	            self.bank_kv.append(key[:, :, block_size:, :])
	            self.bank_kv.append(value[:, :, block_size:, :])
	        else:
	            # Replace the condition part with the cached key/value.
	            key = torch.concat(
	                [key[:, :, :block_size, :], self.bank_kv[0]], dim=-2)
	            value = torch.concat(
	                [value[:, :, :block_size, :], self.bank_kv[1]], dim=-2)

	        if attn.norm_q is not None:
	            query = attn.norm_q(query)
	        if attn.norm_k is not None:
	            key = attn.norm_k(key)

	        if image_rotary_emb is not None:
	            from diffusers.models.embeddings import apply_rotary_emb
	            query = apply_rotary_emb(query, image_rotary_emb)
	            key = apply_rotary_emb(key, image_rotary_emb)

	        if cache:
	            mask = self._build_attention_mask(
	                total_len, block_size, cond_size, hidden_states.device, query.dtype)
	            hidden_states = F.scaled_dot_product_attention(
	                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
	            self.bank_attn = hidden_states[:, :, block_size:, :]
	        else:
	            # Only the non-condition queries are recomputed; the condition
	            # outputs come from the cache.
	            query = query[:, :, :block_size, :]
	            hidden_states = F.scaled_dot_product_attention(
	                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
	            hidden_states = torch.concat(
	                [hidden_states, self.bank_attn], dim=-2)

	        hidden_states = hidden_states.transpose(1, 2).reshape(
	            batch_size, -1, attn.heads * head_dim)
	        hidden_states = hidden_states.to(query.dtype)

	        cond_hidden_states = hidden_states[:, block_size:, :]
	        hidden_states = hidden_states[:, :block_size, :]

	        return hidden_states if not use_cond else (hidden_states, cond_hidden_states)


	class MultiDoubleStreamBlockLoraProcessor(nn.Module):
	    """Joint text/image attention processor with per-condition LoRA and caching.

	    ``encoder_hidden_states`` tokens are projected with the ``add_*`` layers of
	    ``attn`` and prepended to the ``hidden_states`` tokens. The last
	    ``n_loras * cond_size`` tokens of ``hidden_states`` are treated as
	    condition tokens. While ``bank_kv`` is empty, a call runs the LoRA q/k/v
	    path and stores the condition key/value slices in ``bank_kv`` and the
	    condition attention output in ``bank_attn``; later calls reuse them.
	    Nothing in this class empties ``bank_kv``; it has to be cleared from
	    outside for the LoRA q/k/v path to run again. The output-projection LoRA
	    (``proj_loras``) is applied on every call.

	    Args:
	        dim: Feature width passed to every ``LoRALinearLayer``.
	        ranks: Per-condition LoRA ranks (indexed ``0 .. n_loras - 1``).
	        lora_weights: Per-condition multipliers applied to the LoRA outputs.
	        network_alphas: Per-condition ``network_alpha`` values.
	        device: Optional device for the LoRA layers and this module.
	        dtype: Optional dtype for the LoRA layers.
	        cond_width: Width used to derive the per-condition token count.
	        cond_height: Height used to derive the per-condition token count.
	        n_loras: Number of condition spans.
	    """

	    def __init__(self, dim: int, ranks=None, lora_weights=None, network_alphas=None,
	                 device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
	        super().__init__()
	        # `None` defaults avoid sharing one mutable list between instances.
	        ranks = [] if ranks is None else ranks
	        network_alphas = [] if network_alphas is None else network_alphas

	        self.n_loras = n_loras
	        self.cond_width = cond_width
	        self.cond_height = cond_height

	        def build_lora_bank():
	            # One LoRA layer per condition span, each restricted to span `i`.
	            return nn.ModuleList([
	                LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype,
	                                cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
	                for i in range(n_loras)
	            ])

	        # Creation order (q, k, v, proj) is kept so parameter registration
	        # order and random initialisation order stay the same.
	        self.q_loras = build_lora_bank()
	        self.k_loras = build_lora_bank()
	        self.v_loras = build_lora_bank()
	        self.proj_loras = build_lora_bank()
	        self.lora_weights = [] if lora_weights is None else lora_weights
	        self.bank_attn = None
	        self.bank_kv = []

	        # Move processor to device if specified
	        if device is not None:
	            self.to(device)

	    @staticmethod
	    def _split_heads(tensor, batch_size, heads, head_dim):
	        """Reshape ``(batch, tokens, heads * head_dim)`` to ``(batch, heads, tokens, head_dim)``."""
	        return tensor.view(batch_size, -1, heads, head_dim).transpose(1, 2)

	    def _build_attention_mask(self, seq_len, block_size, cond_size, device, dtype):
	        """Build the additive mask: 0 where attention is allowed, -1e10 where blocked.

	        Rows before ``block_size`` may attend to every token; rows of a
	        condition span may attend only to their own span.
	        """
	        mask = torch.ones((seq_len, seq_len), device=device)
	        mask[:block_size, :] = 0
	        for i in range(self.n_loras):
	            start = block_size + i * cond_size
	            end = start + cond_size
	            mask[start:end, start:end] = 0
	        return (mask * -1e10).to(dtype)

	    def __call__(self,
	                 attn: Attention,
	                 hidden_states: torch.FloatTensor,
	                 encoder_hidden_states: torch.FloatTensor = None,
	                 attention_mask: Optional[torch.FloatTensor] = None,
	                 image_rotary_emb: Optional[torch.Tensor] = None,
	                 use_cond=False,
	                 ) -> torch.FloatTensor:
	        """Run joint attention and return the text, image and condition outputs.

	        Args:
	            attn: Attention module providing ``to_q``/``to_k``/``to_v``, the
	                ``add_*_proj`` layers, the norm layers, ``to_out`` and
	                ``to_add_out``.
	            hidden_states: Input indexed as ``(batch, tokens, features)``.
	            encoder_hidden_states: Context tokens; required in practice, since
	                its token count and projections are used unconditionally.
	            attention_mask: Accepted for interface compatibility; not used.
	            image_rotary_emb: Optional rotary embedding applied to query and key.
	            use_cond: Selects the return layout (see Returns).

	        Returns:
	            ``(hidden_states, encoder_hidden_states, cond_hidden_states)`` when
	            ``use_cond`` is true, otherwise
	            ``(encoder_hidden_states, hidden_states)``. Note the two layouts
	            order the first two items differently.
	        """
	        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
	        text_len = encoder_hidden_states.shape[1]
	        # NOTE: evaluated strictly left to right, exactly as written.
	        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
	        # Non-condition tokens within `hidden_states` alone ...
	        block_size = hidden_states.shape[1] - cond_size * self.n_loras
	        # ... and within the joint (context + hidden) sequence.
	        joint_len = text_len + hidden_states.shape[1]
	        joint_block_size = joint_len - cond_size * self.n_loras

	        # `context` projections.
	        encoder_hidden_states_query_proj = attn.add_q_proj(
	            encoder_hidden_states)
	        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
	        encoder_hidden_states_value_proj = attn.add_v_proj(
	            encoder_hidden_states)

	        # Head size is derived from the projection width rather than a fixed
	        # constant, so other model widths are handled too.
	        context_head_dim = encoder_hidden_states_query_proj.shape[-1] // attn.heads
	        encoder_hidden_states_query_proj = self._split_heads(
	            encoder_hidden_states_query_proj, batch_size, attn.heads, context_head_dim)
	        encoder_hidden_states_key_proj = self._split_heads(
	            encoder_hidden_states_key_proj, batch_size, attn.heads, context_head_dim)
	        encoder_hidden_states_value_proj = self._split_heads(
	            encoder_hidden_states_value_proj, batch_size, attn.heads, context_head_dim)

	        if attn.norm_added_q is not None:
	            encoder_hidden_states_query_proj = attn.norm_added_q(
	                encoder_hidden_states_query_proj)
	        if attn.norm_added_k is not None:
	            encoder_hidden_states_key_proj = attn.norm_added_k(
	                encoder_hidden_states_key_proj)

	        # An empty bank means this call must compute and store the cache.
	        cache = not self.bank_kv

	        query = attn.to_q(hidden_states)
	        key = attn.to_k(hidden_states)
	        value = attn.to_v(hidden_states)
	        if cache:
	            # LoRA deltas on q/k/v are only computed on the caching pass.
	            for i in range(self.n_loras):
	                scale = self.lora_weights[i]
	                query = query + scale * self.q_loras[i](hidden_states)
	                key = key + scale * self.k_loras[i](hidden_states)
	                value = value + scale * self.v_loras[i](hidden_states)

	        head_dim = key.shape[-1] // attn.heads
	        query = self._split_heads(query, batch_size, attn.heads, head_dim)
	        key = self._split_heads(key, batch_size, attn.heads, head_dim)
	        value = self._split_heads(value, batch_size, attn.heads, head_dim)

	        if cache:
	            # Stored before normalisation and rotary embedding are applied.
	            self.bank_kv.append(key[:, :, block_size:, :])
	            self.bank_kv.append(value[:, :, block_size:, :])
	        else:
	            # Replace the condition part with the cached key/value.
	            key = torch.concat(
	                [key[:, :, :block_size, :], self.bank_kv[0]], dim=-2)
	            value = torch.concat(
	                [value[:, :, :block_size, :], self.bank_kv[1]], dim=-2)

	        if attn.norm_q is not None:
	            query = attn.norm_q(query)
	        if attn.norm_k is not None:
	            key = attn.norm_k(key)

	        # attention
	        query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
	        key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
	        value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

	        if image_rotary_emb is not None:
	            from diffusers.models.embeddings import apply_rotary_emb
	            query = apply_rotary_emb(query, image_rotary_emb)
	            key = apply_rotary_emb(key, image_rotary_emb)

	        if cache:
	            mask = self._build_attention_mask(
	                joint_len, joint_block_size, cond_size, hidden_states.device, query.dtype)
	            hidden_states = F.scaled_dot_product_attention(
	                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
	            self.bank_attn = hidden_states[:, :, joint_block_size:, :]
	        else:
	            # Only the non-condition queries are recomputed; the condition
	            # outputs come from the cache.
	            query = query[:, :, :joint_block_size, :]
	            hidden_states = F.scaled_dot_product_attention(
	                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
	            hidden_states = torch.concat(
	                [hidden_states, self.bank_attn], dim=-2)

	        hidden_states = hidden_states.transpose(1, 2).reshape(
	            batch_size, -1, attn.heads * head_dim)
	        hidden_states = hidden_states.to(query.dtype)

	        # Split the joint sequence back into context and hidden parts.
	        encoder_hidden_states, hidden_states = (
	            hidden_states[:, :text_len],
	            hidden_states[:, text_len:],
	        )

	        # Linear projection (with LoRA weight applied to each proj layer)
	        hidden_states = attn.to_out[0](hidden_states)
	        for i in range(self.n_loras):
	            hidden_states = hidden_states + \
	                self.lora_weights[i] * self.proj_loras[i](hidden_states)
	        # dropout
	        hidden_states = attn.to_out[1](hidden_states)
	        encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

	        cond_hidden_states = hidden_states[:, block_size:, :]
	        hidden_states = hidden_states[:, :block_size, :]

	        return (hidden_states, encoder_hidden_states, cond_hidden_states) if use_cond else (encoder_hidden_states, hidden_states)