Spaces:

adarshnagrikar
/

studio-ai

Runtime error

File size: 18,559 Bytes

d65ca2a
 
 
 
 
 
 
 
 
1ba54e6
 
d65ca2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1ba54e6
 
51246d7
1ba54e6
 
51246d7
 
 
 
 
 
 
 
 
 
 
 
d65ca2a
 
 
 
 
 
 
 
 
51246d7
d65ca2a
 
 
 
 
 
 
 
 
 
 
 
1ba54e6
d65ca2a
1ba54e6
d65ca2a
 
 
 
1ba54e6
d65ca2a
 
 
 
 
 
 
1ba54e6
d65ca2a
 
 
 
 
 
 
 
51246d7
d65ca2a
1ba54e6
51246d7
d65ca2a
 
 
1ba54e6
51246d7
d65ca2a
 
 
1ba54e6
51246d7
d65ca2a
 
 
 
 
51246d7
 
 
 
d65ca2a
 
1ba54e6
 
 
 
 
 
 
d65ca2a
 
 
 
1ba54e6
d65ca2a
 
 
1ba54e6
d65ca2a
 
 
1ba54e6
d65ca2a
1ba54e6
 
 
d65ca2a
1ba54e6
 
 
 
 
 
d65ca2a
 
 
1ba54e6
 
 
 
 
 
 
d65ca2a
 
 
1ba54e6
d65ca2a
 
 
 
 
 
 
 
 
1ba54e6
d65ca2a
1ba54e6
 
 
d65ca2a
 
 
 
 
 
 
1ba54e6
 
d65ca2a
1ba54e6
d65ca2a
1ba54e6
d65ca2a
 
 
 
 
1ba54e6
 
 
 
 
 
 
 
 
 
 
 
d65ca2a
 
 
 
 
 
 
 
 
 
1ba54e6
d65ca2a
 
1ba54e6
 
 
 
 
 
 
d65ca2a
 
1ba54e6
 
d65ca2a
 
 
 
 
 
 
51246d7
d65ca2a
 
 
 
 
1ba54e6
51246d7
d65ca2a
 
 
1ba54e6
51246d7
d65ca2a
 
 
1ba54e6
51246d7
d65ca2a
 
 
1ba54e6
51246d7
d65ca2a
 
 
 
 
51246d7
 
 
 
d65ca2a
 
1ba54e6
 
 
 
 
 
 
 
d65ca2a
1ba54e6
 
 
 
d65ca2a
 
 
 
 
 
1ba54e6
 
d65ca2a
1ba54e6
 
d65ca2a
 
 
 
 
 
 
 
 
 
 
 
1ba54e6
 
d65ca2a
1ba54e6
 
 
 
d65ca2a
 
 
1ba54e6
d65ca2a
1ba54e6
 
 
 
d65ca2a
1ba54e6
 
 
 
 
 
d65ca2a
 
 
1ba54e6
 
 
 
 
 
 
d65ca2a
 
 
 
 
 
 
1ba54e6
d65ca2a
 
 
 
 
 
 
 
 
1ba54e6
d65ca2a
1ba54e6
 
 
d65ca2a
 
 
 
 
 
 
1ba54e6
 
d65ca2a
1ba54e6
d65ca2a
1ba54e6
d65ca2a
 
1ba54e6
d65ca2a
 
1ba54e6
 
 
 
 
 
 
 
 
 
 
 
 
d65ca2a
 
 
 
 
 
 
 
 
 
 
 
 
 
1ba54e6
d65ca2a
 
1ba54e6
 
 
 
 
 
 
d65ca2a
1ba54e6
d65ca2a
 
1ba54e6
d65ca2a
 
 
 
 
1ba54e6
 
d65ca2a
 
 
1ba54e6

import inspect
import math
from typing import Callable, List, Optional, Tuple, Union
from einops import rearrange
import torch
from torch import nn
import torch.nn.functional as F
from torch import Tensor
from diffusers.models.attention_processor import Attention


class LoRALinearLayer(nn.Module):
    """Low-rank adapter that only acts on one segment of the token sequence.

    The layer is a bias-free ``down`` projection (``in_features -> rank``)
    followed by a bias-free ``up`` projection (``rank -> out_features``).
    Before projecting, every token outside the segment selected by ``number``
    is zeroed, so the adapter contributes nothing for those positions.

    The sequence is treated as ``block_size`` leading tokens followed by
    ``n_loras`` segments of ``cond_size`` tokens each, where ``cond_size`` is
    derived from ``cond_width`` / ``cond_height`` (see ``forward``).

    Args:
        in_features: Size of the last dimension of the input.
        out_features: Size of the last dimension of the output.
        rank: Inner dimension of the low-rank decomposition.
        network_alpha: If given, the output is scaled by
            ``network_alpha / rank``.
        device: Optional device the two linear layers are moved to.
        dtype: Optional dtype the two linear layers are cast to.
        cond_width: Width used to derive the per-segment token count.
        cond_height: Height used to derive the per-segment token count.
        number: Index of the trailing segment this adapter acts on.
        n_loras: Total number of trailing segments in the sequence.
    """

    def __init__(
        self,
        in_features: int,
        out_features: int,
        rank: int = 4,
        network_alpha: Optional[float] = None,
        device: Optional[Union[torch.device, str]] = None,
        dtype: Optional[torch.dtype] = None,
        cond_width=512,
        cond_height=512,
        number=0,
        n_loras=1
    ):
        super().__init__()
        self.down = nn.Linear(in_features, rank, bias=False)
        self.up = nn.Linear(rank, out_features, bias=False)

        # Only forward the placement arguments the caller actually supplied.
        to_kwargs = {}
        if device is not None:
            to_kwargs['device'] = device
        if dtype is not None:
            to_kwargs['dtype'] = dtype
        if to_kwargs:
            self.down = self.down.to(**to_kwargs)
            self.up = self.up.to(**to_kwargs)

        # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
        # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
        self.network_alpha = network_alpha
        self.rank = rank
        self.out_features = out_features
        self.in_features = in_features

        # `up` starts at zero, so a freshly constructed adapter outputs zeros.
        nn.init.normal_(self.down.weight, std=1 / rank)
        nn.init.zeros_(self.up.weight)

        self.cond_height = cond_height
        self.cond_width = cond_width
        self.number = number
        self.n_loras = n_loras

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the adapter to the segment selected by ``self.number``.

        Args:
            hidden_states: Tensor indexed as ``(batch, sequence, features)``.

        Returns:
            Tensor with the input's dtype and ``out_features`` as its last
            dimension. Rows outside the selected segment are zero because
            both projections are bias-free.
        """
        orig_dtype = hidden_states.dtype
        dtype = self.down.weight.dtype

        seq_len = hidden_states.shape[1]
        # Evaluated strictly left to right; this equals
        # (cond_width // 8) * (cond_height // 8) // 4 whenever cond_height is
        # a multiple of 8. Kept in this exact form for compatibility.
        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
        block_size = seq_len - cond_size * self.n_loras
        segment_start = block_size + self.number * cond_size
        segment_end = block_size + (self.number + 1) * cond_size

        # Per-token mask that broadcasts over batch and feature dimensions,
        # so it works for any feature width instead of a fixed 3072.
        mask = torch.ones((1, seq_len, 1), device=hidden_states.device, dtype=dtype)
        mask[:, :segment_start, :] = 0
        mask[:, segment_end:, :] = 0
        hidden_states = mask * hidden_states

        down_hidden_states = self.down(hidden_states.to(dtype))
        up_hidden_states = self.up(down_hidden_states)

        if self.network_alpha is not None:
            up_hidden_states *= self.network_alpha / self.rank

        return up_hidden_states.to(orig_dtype)


class MultiSingleStreamBlockLoraProcessor(nn.Module):
    """Attention processor with per-segment LoRA adapters and a cache.

    The input sequence is treated as ``block_size`` leading tokens followed by
    ``n_loras`` trailing segments of ``cond_size`` tokens each.

    * While ``self.bank_kv`` is empty, the call adds the LoRA contributions to
      the query/key/value projections, attends with a mask that lets each
      trailing segment see only itself, and stores the trailing key/value
      slices (``self.bank_kv``) and trailing attention output
      (``self.bank_attn``).
    * On later calls the stored tensors are reused and no LoRA is applied.

    Nothing in this class clears the cache; presumably the caller resets
    ``bank_kv`` between generations (verify against the caller).

    Args:
        dim: Input and output width of every LoRA adapter.
        ranks: Per-adapter rank, indexed ``0 .. n_loras - 1``.
        lora_weights: Per-adapter scalar applied to the adapter output.
        network_alphas: Per-adapter ``network_alpha``.
        device: Optional device for the adapters and this module.
        dtype: Optional dtype for the adapters.
        cond_width: Width used to derive ``cond_size``.
        cond_height: Height used to derive ``cond_size``.
        n_loras: Number of trailing segments / adapters per projection.
    """

    def __init__(self, dim: int, ranks=None, lora_weights=None, network_alphas=None, device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
        super().__init__()
        # `None` replaces the former shared `[]` defaults so that instances
        # built without arguments never alias one list object.
        ranks = [] if ranks is None else ranks
        network_alphas = [] if network_alphas is None else network_alphas

        self.n_loras = n_loras
        self.cond_width = cond_width
        self.cond_height = cond_height

        def build_lora_bank() -> nn.ModuleList:
            # One adapter per trailing segment; adapter i only acts on segment i.
            return nn.ModuleList([
                LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype,
                                cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
                for i in range(n_loras)
            ])

        self.q_loras = build_lora_bank()
        self.k_loras = build_lora_bank()
        self.v_loras = build_lora_bank()
        # A caller-supplied list is stored as-is (not copied).
        self.lora_weights = [] if lora_weights is None else lora_weights
        self.bank_attn = None
        self.bank_kv = []

        # Move processor to device if specified
        if device is not None:
            self.to(device)

    @staticmethod
    def _split_heads(tensor: torch.Tensor, batch_size: int, heads: int, head_dim: int) -> torch.Tensor:
        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        return tensor.view(batch_size, -1, heads, head_dim).transpose(1, 2)

    @staticmethod
    def _norm_and_rotate(attn, query, key, image_rotary_emb):
        # Optional q/k normalisation followed by optional rotary embedding.
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)

        if image_rotary_emb is not None:
            from diffusers.models.embeddings import apply_rotary_emb
            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)
        return query, key

    def _segment_attention_mask(self, seq_len, block_size, cond_size, device, dtype):
        # Additive mask: rows of the leading block may attend everywhere; each
        # trailing segment may attend only to itself. Blocked entries get -1e10.
        mask = torch.ones((seq_len, seq_len), device=device)
        mask[:block_size, :] = 0  # leading rows are unrestricted
        for i in range(self.n_loras):
            start = i * cond_size + block_size
            end = (i + 1) * cond_size + block_size
            mask[start:end, start:end] = 0  # diagonal blocks
        mask = mask * -1e10
        return mask.to(dtype)

    def __call__(self,
                 attn: Attention,
                 hidden_states: torch.FloatTensor,
                 encoder_hidden_states: torch.FloatTensor = None,
                 attention_mask: Optional[torch.FloatTensor] = None,
                 image_rotary_emb: Optional[torch.Tensor] = None,
                 use_cond=False
                 ) -> torch.FloatTensor:
        """Run attention over ``hidden_states``.

        Args:
            attn: Attention module providing ``to_q``/``to_k``/``to_v``,
                ``norm_q``/``norm_k`` and ``heads``.
            hidden_states: Tensor indexed as ``(batch, sequence, features)``.
            encoder_hidden_states: Only used to read the batch size when given.
            attention_mask: Accepted for interface compatibility; not used.
            image_rotary_emb: Optional rotary embedding applied to query/key.
            use_cond: When true, also return the trailing-segment output.

        Returns:
            The first ``block_size`` tokens of the attention output, or a
            ``(hidden_states, cond_hidden_states)`` tuple when ``use_cond``.
        """
        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        seq_len = hidden_states.shape[1]
        # Same left-to-right formula as LoRALinearLayer.forward.
        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
        block_size = seq_len - cond_size * self.n_loras

        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        if not self.bank_kv:
            # Cache-filling pass: LoRA contributions are added here only.
            for i in range(self.n_loras):
                query = query + \
                    self.lora_weights[i] * self.q_loras[i](hidden_states)
                key = key + self.lora_weights[i] * \
                    self.k_loras[i](hidden_states)
                value = value + \
                    self.lora_weights[i] * self.v_loras[i](hidden_states)

            head_dim = key.shape[-1] // attn.heads
            query = self._split_heads(query, batch_size, attn.heads, head_dim)
            key = self._split_heads(key, batch_size, attn.heads, head_dim)
            value = self._split_heads(value, batch_size, attn.heads, head_dim)

            # Stored before normalisation / rotary embedding; the reuse path
            # applies both again to the concatenated keys.
            self.bank_kv.append(key[:, :, block_size:, :])
            self.bank_kv.append(value[:, :, block_size:, :])

            query, key = self._norm_and_rotate(attn, query, key, image_rotary_emb)

            mask = self._segment_attention_mask(
                seq_len, block_size, cond_size, hidden_states.device, query.dtype)

            hidden_states = F.scaled_dot_product_attention(
                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
            self.bank_attn = hidden_states[:, :, block_size:, :]

        else:
            # Reuse pass: trailing keys/values and attention output come from the cache.
            head_dim = query.shape[-1] // attn.heads
            query = self._split_heads(query, batch_size, attn.heads, head_dim)
            key = self._split_heads(key, batch_size, attn.heads, head_dim)
            value = self._split_heads(value, batch_size, attn.heads, head_dim)

            key = torch.concat(
                [key[:, :, :block_size, :], self.bank_kv[0]], dim=-2)
            value = torch.concat(
                [value[:, :, :block_size, :], self.bank_kv[1]], dim=-2)

            query, key = self._norm_and_rotate(attn, query, key, image_rotary_emb)

            # Only the leading block is recomputed.
            query = query[:, :, :block_size, :]

            hidden_states = F.scaled_dot_product_attention(
                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
            hidden_states = torch.concat(
                [hidden_states, self.bank_attn], dim=-2)

        # (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        cond_hidden_states = hidden_states[:, block_size:, :]
        hidden_states = hidden_states[:, : block_size, :]

        return hidden_states if not use_cond else (hidden_states, cond_hidden_states)


class MultiDoubleStreamBlockLoraProcessor(nn.Module):
    """Joint (context + image) attention processor with LoRA adapters and a cache.

    ``hidden_states`` is treated as ``block_size`` leading tokens followed by
    ``n_loras`` trailing segments of ``cond_size`` tokens each. The context
    tokens from ``encoder_hidden_states`` are projected with the ``add_*``
    layers of ``attn`` and prepended for attention.

    * While ``self.bank_kv`` is empty, the call adds LoRA contributions to the
      query/key/value projections, attends with a mask that lets each trailing
      segment see only itself, and stores the trailing key/value slices
      (``self.bank_kv``) and trailing attention output (``self.bank_attn``).
    * On later calls the stored tensors are reused and no q/k/v LoRA is applied.

    The output-projection adapters (``proj_loras``) are applied on every call.
    Nothing in this class clears the cache; presumably the caller resets
    ``bank_kv`` between generations (verify against the caller).

    Args:
        dim: Input and output width of every LoRA adapter.
        ranks: Per-adapter rank, indexed ``0 .. n_loras - 1``.
        lora_weights: Per-adapter scalar applied to the adapter output.
        network_alphas: Per-adapter ``network_alpha``.
        device: Optional device for the adapters and this module.
        dtype: Optional dtype for the adapters.
        cond_width: Width used to derive ``cond_size``.
        cond_height: Height used to derive ``cond_size``.
        n_loras: Number of trailing segments / adapters per projection.
    """

    def __init__(self, dim: int, ranks=None, lora_weights=None, network_alphas=None, device=None, dtype=None, cond_width=512, cond_height=512, n_loras=1):
        super().__init__()
        # `None` replaces the former shared `[]` defaults so that instances
        # built without arguments never alias one list object.
        ranks = [] if ranks is None else ranks
        network_alphas = [] if network_alphas is None else network_alphas

        self.n_loras = n_loras
        self.cond_width = cond_width
        self.cond_height = cond_height

        def build_lora_bank() -> nn.ModuleList:
            # One adapter per trailing segment; adapter i only acts on segment i.
            return nn.ModuleList([
                LoRALinearLayer(dim, dim, ranks[i], network_alphas[i], device=device, dtype=dtype,
                                cond_width=cond_width, cond_height=cond_height, number=i, n_loras=n_loras)
                for i in range(n_loras)
            ])

        self.q_loras = build_lora_bank()
        self.k_loras = build_lora_bank()
        self.v_loras = build_lora_bank()
        self.proj_loras = build_lora_bank()
        # A caller-supplied list is stored as-is (not copied).
        self.lora_weights = [] if lora_weights is None else lora_weights
        self.bank_attn = None
        self.bank_kv = []

        # Move processor to device if specified
        if device is not None:
            self.to(device)

    @staticmethod
    def _split_heads(tensor: torch.Tensor, batch_size: int, heads: int, head_dim: int) -> torch.Tensor:
        # (batch, seq, heads * head_dim) -> (batch, heads, seq, head_dim)
        return tensor.view(batch_size, -1, heads, head_dim).transpose(1, 2)

    @staticmethod
    def _normalize_qk(attn, query, key):
        # Optional q/k normalisation provided by the attention module.
        if attn.norm_q is not None:
            query = attn.norm_q(query)
        if attn.norm_k is not None:
            key = attn.norm_k(key)
        return query, key

    @staticmethod
    def _rotate_qk(query, key, image_rotary_emb):
        # Optional rotary embedding on the joint (context + image) sequence.
        if image_rotary_emb is not None:
            from diffusers.models.embeddings import apply_rotary_emb
            query = apply_rotary_emb(query, image_rotary_emb)
            key = apply_rotary_emb(key, image_rotary_emb)
        return query, key

    def _segment_attention_mask(self, seq_len, block_size, cond_size, device, dtype):
        # Additive mask: rows of the leading block may attend everywhere; each
        # trailing segment may attend only to itself. Blocked entries get -1e10.
        mask = torch.ones((seq_len, seq_len), device=device)
        mask[:block_size, :] = 0  # leading rows are unrestricted
        for i in range(self.n_loras):
            start = i * cond_size + block_size
            end = (i + 1) * cond_size + block_size
            mask[start:end, start:end] = 0  # diagonal blocks
        mask = mask * -1e10
        return mask.to(dtype)

    def __call__(self,
                 attn: Attention,
                 hidden_states: torch.FloatTensor,
                 encoder_hidden_states: torch.FloatTensor = None,
                 attention_mask: Optional[torch.FloatTensor] = None,
                 image_rotary_emb: Optional[torch.Tensor] = None,
                 use_cond=False,
                 ) -> torch.FloatTensor:
        """Run joint attention over context and image tokens.

        Args:
            attn: Attention module providing the ``to_*``, ``add_*_proj``,
                ``norm_*``, ``to_out``, ``to_add_out`` layers and ``heads``.
            hidden_states: Tensor indexed as ``(batch, sequence, features)``.
            encoder_hidden_states: Context tokens. Although the default is
                ``None``, the body dereferences it unconditionally, so it is
                effectively required.
            attention_mask: Accepted for interface compatibility; not used.
            image_rotary_emb: Optional rotary embedding applied to query/key
                after context and image tokens are concatenated.
            use_cond: Selects the return form (see below).

        Returns:
            ``(hidden_states, encoder_hidden_states, cond_hidden_states)`` when
            ``use_cond`` is true, otherwise
            ``(encoder_hidden_states, hidden_states)``. Note the two forms
            order the first two elements differently. ``hidden_states`` is
            truncated to its first ``block_size`` tokens in both forms.
        """
        batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
        # Same left-to-right formula as LoRALinearLayer.forward.
        cond_size = self.cond_width // 8 * self.cond_height // 8 * 16 // 64
        block_size = hidden_states.shape[1] - cond_size * self.n_loras
        context_len = encoder_hidden_states.shape[1]
        joint_seq_len = context_len + hidden_states.shape[1]
        # Leading block of the joint sequence: context tokens + leading image tokens.
        joint_block_size = joint_seq_len - cond_size * self.n_loras

        # `context` projections.
        encoder_hidden_states_query_proj = attn.add_q_proj(
            encoder_hidden_states)
        encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
        encoder_hidden_states_value_proj = attn.add_v_proj(
            encoder_hidden_states)

        # Derived from the projection output instead of a fixed 3072.
        head_dim = encoder_hidden_states_query_proj.shape[-1] // attn.heads

        encoder_hidden_states_query_proj = self._split_heads(
            encoder_hidden_states_query_proj, batch_size, attn.heads, head_dim)
        encoder_hidden_states_key_proj = self._split_heads(
            encoder_hidden_states_key_proj, batch_size, attn.heads, head_dim)
        encoder_hidden_states_value_proj = self._split_heads(
            encoder_hidden_states_value_proj, batch_size, attn.heads, head_dim)

        if attn.norm_added_q is not None:
            encoder_hidden_states_query_proj = attn.norm_added_q(
                encoder_hidden_states_query_proj)
        if attn.norm_added_k is not None:
            encoder_hidden_states_key_proj = attn.norm_added_k(
                encoder_hidden_states_key_proj)

        query = attn.to_q(hidden_states)
        key = attn.to_k(hidden_states)
        value = attn.to_v(hidden_states)

        if not self.bank_kv:
            # Cache-filling pass: q/k/v LoRA contributions are added here only.
            for i in range(self.n_loras):
                query = query + \
                    self.lora_weights[i] * self.q_loras[i](hidden_states)
                key = key + self.lora_weights[i] * \
                    self.k_loras[i](hidden_states)
                value = value + \
                    self.lora_weights[i] * self.v_loras[i](hidden_states)

            head_dim = key.shape[-1] // attn.heads
            query = self._split_heads(query, batch_size, attn.heads, head_dim)
            key = self._split_heads(key, batch_size, attn.heads, head_dim)
            value = self._split_heads(value, batch_size, attn.heads, head_dim)

            # Stored before normalisation / rotary embedding; the reuse path
            # applies both again to the concatenated keys.
            self.bank_kv.append(key[:, :, block_size:, :])
            self.bank_kv.append(value[:, :, block_size:, :])

            query, key = self._normalize_qk(attn, query, key)

            # attention
            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

            query, key = self._rotate_qk(query, key, image_rotary_emb)

            mask = self._segment_attention_mask(
                joint_seq_len, joint_block_size, cond_size, hidden_states.device, query.dtype)

            hidden_states = F.scaled_dot_product_attention(
                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
            self.bank_attn = hidden_states[:, :, joint_block_size:, :]

        else:
            # Reuse pass: trailing keys/values and attention output come from the cache.
            head_dim = query.shape[-1] // attn.heads
            query = self._split_heads(query, batch_size, attn.heads, head_dim)
            key = self._split_heads(key, batch_size, attn.heads, head_dim)
            value = self._split_heads(value, batch_size, attn.heads, head_dim)

            key = torch.concat(
                [key[:, :, :block_size, :], self.bank_kv[0]], dim=-2)
            value = torch.concat(
                [value[:, :, :block_size, :], self.bank_kv[1]], dim=-2)

            query, key = self._normalize_qk(attn, query, key)

            # attention
            query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
            key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
            value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)

            query, key = self._rotate_qk(query, key, image_rotary_emb)

            # Only the leading (context + leading image) block is recomputed.
            query = query[:, :, :joint_block_size, :]

            hidden_states = F.scaled_dot_product_attention(
                query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
            hidden_states = torch.concat(
                [hidden_states, self.bank_attn], dim=-2)

        # (batch, heads, seq, head_dim) -> (batch, seq, heads * head_dim)
        hidden_states = hidden_states.transpose(1, 2).reshape(
            batch_size, -1, attn.heads * head_dim)
        hidden_states = hidden_states.to(query.dtype)

        # Split the joint sequence back into context and image parts.
        encoder_hidden_states, hidden_states = (
            hidden_states[:, :context_len],
            hidden_states[:, context_len:],
        )

        # Linear projection (with LoRA weight applied to each proj layer)
        hidden_states = attn.to_out[0](hidden_states)
        # NOTE(review): each adapter reads the already-projected (and, for
        # i > 0, already-adapted) tensor rather than the projection input.
        # Kept as-is because existing adapter weights may rely on it.
        for i in range(self.n_loras):
            hidden_states = hidden_states + \
                self.lora_weights[i] * self.proj_loras[i](hidden_states)
        # dropout
        hidden_states = attn.to_out[1](hidden_states)
        encoder_hidden_states = attn.to_add_out(encoder_hidden_states)

        cond_hidden_states = hidden_states[:, block_size:, :]
        hidden_states = hidden_states[:, :block_size, :]

        return (hidden_states, encoder_hidden_states, cond_hidden_states) if use_cond else (encoder_hidden_states, hidden_states)