fac / fc /sd_webui_masactrl_orig /scripts /masactrl_controller.py

Upload fc using SD-Hub extension

50261d7 verified almost 2 years ago

34.7 kB


	from __future__ import annotations

	import enum
	from inspect import isfunction

	# from diffusers.utils import deprecate
	from ldm.modules.diffusionmodules.openaimodel import UNetModel
	import torch
	from ldm.util import default
	from modules.hypernetworks import hypernetwork
	from modules import shared, devices
	from modules.sd_hijack_optimizations import get_available_vram
	from torch import nn, einsum
	import torch.nn.functional as F
	from einops import rearrange, repeat
	import os
	import math
	import numpy as np


	# Attention precision selector, read once at import time from the
	# ATTN_PRECISION environment variable; falls back to "fp32" when unset.
	_ATTN_PRECISION = os.getenv("ATTN_PRECISION", "fp32")

	def exists(val):
	    """Return ``True`` for any value other than ``None`` (falsy values included)."""
	    return not (val is None)


	def uniq(arr):
	    """Return a keys view of the distinct elements of ``arr`` in first-seen order.

	    Elements must be hashable because they are used as dictionary keys.
	    """
	    seen = dict.fromkeys(arr, True)
	    return seen.keys()


	def default(val, d):
	    """Return ``val`` unless it is ``None``, otherwise the fallback ``d``.

	    When ``d`` is a plain Python function (as judged by
	    ``inspect.isfunction``) it is called with no arguments and its result is
	    returned, so an expensive fallback is only built when needed.
	    """
	    if val is None:
	        return d() if isfunction(d) else d
	    return val


	def max_neg_value(t):
	    """Return the most negative finite value of ``t``'s floating-point dtype."""
	    dtype_info = torch.finfo(t.dtype)
	    return -dtype_info.max


	def init_(tensor):
	    """Fill ``tensor`` in place with uniform noise and return the same tensor.

	    Values are drawn between ``-1/sqrt(d)`` and ``1/sqrt(d)``, where ``d`` is
	    the size of the last dimension.
	    """
	    bound = 1 / math.sqrt(tensor.shape[-1])
	    tensor.uniform_(-bound, bound)
	    return tensor

	class ProxyReconMasaSattn(object):
	    """Proxy that swaps a self-attention module's ``forward`` for a mask-guided one.

	    While attached, the wrapped module's ``forward`` is replaced by
	    :meth:`forward`.  That method asks the controller whether MasA control is
	    active (``query_masa_active``).  When it is, attention is computed twice
	    against the key/value tensors handed out by the controller, once with the
	    foreground mask and once with the background mask, and the two outputs are
	    blended with the binarised mask.  Otherwise regular scaled-dot-product
	    attention runs on the module's own keys and values.

	    Attributes that are not stored on the proxy itself (``heads``, ``to_q``,
	    ``to_k``, ``to_v``, ``to_out``, ``scale`` ...) are resolved on the wrapped
	    module while the proxy is attached.
	    """

	    # Names stored on the proxy itself; these are never delegated.
	    _OWN_ATTRS = ('org_module', 'org_forward', 'attached', 'controller', 'module_key')
	    # Sequence lengths above this value use the sliced attention path.
	    _SPLIT_ATTN_SEQ_THRESHOLD = 20000

	    def __init__(self, controller: "MasaController", module_key: str, org_module: torch.nn.Module = None):
	        super().__init__()
	        self.org_module = org_module
	        self.org_forward = None

	        self.attached = False
	        self.controller = controller
	        self.module_key = module_key

	    def __getattr__(self, attr):
	        """Delegate unknown attributes to the wrapped module while attached.

	        Raises:
	            AttributeError: if the proxy is detached, or ``attr`` is one of the
	                proxy's own names that has not been set.  The previous
	                implementation fell through and returned ``None``, which made
	                ``hasattr`` succeed for every name.
	        """
	        # Read through __dict__ so a missing own attribute cannot recurse here.
	        own = self.__dict__
	        if attr not in self._OWN_ATTRS and own.get('attached'):
	            return getattr(own['org_module'], attr)
	        raise AttributeError(f"{type(self).__name__!r} object has no attribute {attr!r}")

	    def attach(self):
	        """Install :meth:`forward` on the wrapped module (no-op if already attached)."""
	        if self.org_forward is not None:
	            return
	        self.org_forward = self.org_module.forward
	        self.org_module.forward = self.forward
	        self.attached = True

	    def detach(self):
	        """Restore the wrapped module's original ``forward`` (no-op if detached)."""
	        if self.org_forward is None:
	            return
	        self.org_module.forward = self.org_forward
	        self.org_forward = None
	        self.attached = False

	    # implementation from diffusers
	    def prepare_attention_mask(self, attention_mask, target_length, batch_size=None, out_dim=3):
	        """Pad ``attention_mask`` and repeat it per attention head.

	        Args:
	            attention_mask: mask tensor, or ``None`` (returned unchanged).
	            target_length: if the mask's last dimension differs from this
	                value, ``target_length`` zeros are appended along it.
	            batch_size: batch size used for the head-repeat check; ``1`` when
	                omitted (diffusers deprecates omitting it).
	            out_dim: ``3`` repeats along dim 0 when the mask has fewer than
	                ``batch_size * heads`` rows; ``4`` inserts a head dimension
	                at dim 1 and repeats along it.

	        Returns:
	            The prepared mask, or ``None`` when no mask was given.
	        """
	        if attention_mask is None:
	            return attention_mask

	        if batch_size is None:
	            batch_size = 1
	        head_size = self.heads

	        if attention_mask.shape[-1] != target_length:
	            if attention_mask.device.type == "mps":
	                # HACK: MPS: Does not support padding by greater than dimension of input tensor.
	                # Instead, we can manually construct the padding tensor.
	                padding_shape = (attention_mask.shape[0], attention_mask.shape[1], target_length)
	                padding = torch.zeros(padding_shape, dtype=attention_mask.dtype, device=attention_mask.device)
	                attention_mask = torch.cat([attention_mask, padding], dim=2)
	            else:
	                attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)

	        if out_dim == 3:
	            if attention_mask.shape[0] < batch_size * head_size:
	                attention_mask = attention_mask.repeat_interleave(head_size, dim=0)
	        elif out_dim == 4:
	            attention_mask = attention_mask.unsqueeze(1)
	            attention_mask = attention_mask.repeat_interleave(head_size, dim=1)

	        return attention_mask

	    def forward(self, x, context=None, mask=None):
	        """Replacement ``forward`` for the wrapped self-attention module.

	        Args:
	            x: hidden states; unpacked as ``(batch, sequence, inner_dim)``.
	            context: optional attention context, defaults to ``x`` downstream.
	            mask: optional attention mask, used only when MasA is inactive.

	        Returns:
	            The attention output after the module's output projection.
	        """
	        with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=True):
	            if not self.controller.query_masa_active(self.module_key):
	                return self.masa_scaled_dot_product_attention_forward(x, context, mask)

	            _, sequence_length, _ = x.shape
	            masa_mask, masa_kv, masa_mask_threshold = self.controller.retrieve_masa_info_suite(self.module_key)
	            # Follow the input's device instead of assuming CUDA, so the proxy
	            # also works on other backends.
	            masa_kv = {key: value.to(x.device) for key, value in masa_kv.items()}
	            masa_mask = masa_mask.to(x.device)
	            k_ext, v_ext = masa_kv['k_in'], masa_kv['v_in']

	            # resize from latent size instead of mask size
	            latent_h, latent_w = self.controller.current_latent_size[0], self.controller.current_latent_size[1]
	            scale_factor = math.ceil(np.sqrt(latent_h * latent_w / sequence_length))
	            scaled_mask_shape = (math.ceil(latent_h / scale_factor), math.ceil(latent_w / scale_factor))
	            scaled_mask = F.interpolate(masa_mask.unsqueeze(0).unsqueeze(0), scaled_mask_shape).flatten()

	            # Additive masks: 0 keeps a position, the dtype minimum suppresses it.
	            suppress = torch.finfo(k_ext.dtype).min
	            below_threshold = scaled_mask < masa_mask_threshold

	            fg_attn_mask = torch.zeros_like(scaled_mask)
	            fg_attn_mask[below_threshold] = suppress

	            bg_attn_mask = torch.zeros_like(scaled_mask)
	            bg_attn_mask[scaled_mask >= masa_mask_threshold] = suppress

	            if sequence_length > self._SPLIT_ATTN_SEQ_THRESHOLD:
	                attend = self.masa_split_sattn_forward
	            else:
	                attend = self.masa_scaled_dot_product_attention_forward
	            fg_sattn_out = attend(x, context, fg_attn_mask, k_ext, v_ext)
	            bg_sattn_out = attend(x, context, bg_attn_mask, k_ext, v_ext)

	            fg_sattn_out = fg_sattn_out.to(x.device)

	            # 1 where the mask reaches the threshold (foreground), 0 elsewhere.
	            fg_binary_mask = torch.ones_like(scaled_mask)
	            fg_binary_mask[below_threshold] = 0
	            fg_weight = fg_binary_mask.unsqueeze(-1)

	            return fg_sattn_out * fg_weight + bg_sattn_out * (1 - fg_weight)

	    def masa_split_sattn_forward(self, x, context=None, mask=None, external_k_in=None, external_v_in=None):
	        """Sliced (memory-bounded) attention against externally supplied K/V.

	        Args:
	            x: hidden states ``(batch, sequence, inner_dim)``.
	            context: optional context for the module's own K/V projections.
	            mask: optional 1-D additive mask indexed by query position.
	            external_k_in: key projections to attend to; the module's own
	                projections are used when this or ``external_v_in`` is ``None``.
	            external_v_in: value projections to attend to.

	        Returns:
	            The attention output after ``to_out``.

	        Raises:
	            RuntimeError: if more than 64 slices would be required.
	        """
	        h = self.heads
	        q_in = self.to_q(x)

	        use_external = external_k_in is not None and external_v_in is not None
	        log_recon = self.controller.log_recon
	        k_in = v_in = None
	        # The module's own K/V are only needed for logging or as a fallback.
	        # They were previously computed and reported unconditionally, which
	        # raised KeyError in pure RECON mode, where the controller creates no
	        # log buckets.
	        if log_recon or not use_external:
	            context = default(context, x)
	            context_k, context_v = hypernetwork.apply_hypernetworks(shared.loaded_hypernetworks, context)
	            k_in = self.to_k(context_k)
	            v_in = self.to_v(context_v)
	            if log_recon:
	                self.controller.report_sattn(self.module_key, {'k_in': k_in, 'v_in': v_in})
	        if use_external:
	            # be aware that hypernetworks will have no effect
	            k_in, v_in = external_k_in, external_v_in

	        dtype = q_in.dtype
	        if shared.opts.upcast_attn:
	            q_in, k_in, v_in = q_in.float(), k_in.float(), v_in if v_in.device.type == 'mps' else v_in.float()

	        with devices.without_autocast(disable=not shared.opts.upcast_attn):
	            k_in = k_in * self.scale

	            del context, x

	            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
	            del q_in, k_in, v_in

	            r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)

	            mem_free_total = get_available_vram()

	            gb = 1024 ** 3
	            tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * q.element_size()
	            modifier = 3 if q.element_size() == 2 else 2.5
	            mem_required = tensor_size * modifier
	            steps = 1

	            if mem_required > mem_free_total:
	                steps = 2 ** (math.ceil(math.log(mem_required / mem_free_total, 2)))

	            if steps > 64:
	                max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
	                raise RuntimeError(f'Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). '
	                                   f'Need: {mem_required / 64 / gb:0.1f}GB free, Have:{mem_free_total / gb:0.1f}GB free')

	            # Fall back to a single slice when the length is not divisible.
	            slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
	            for i in range(0, q.shape[1], slice_size):
	                end = i + slice_size
	                s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k)

	                if mask is not None:
	                    # Indexed by query position, broadcast over heads and keys.
	                    s1 = s1 + mask[i:end].view(1, -1, 1)
	                s2 = s1.softmax(dim=-1, dtype=q.dtype)
	                del s1

	                r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
	                del s2
	            del q, k, v

	        r1 = r1.to(dtype)

	        r2 = rearrange(r1, '(b h) n d -> b n (h d)', h=h)
	        del r1

	        return self.to_out(r2)

	    def masa_scaled_dot_product_attention_forward(self, x, context=None, mask=None, external_k_in=None, external_v_in=None):
	        """Scaled-dot-product attention, optionally against external K/V.

	        Args:
	            x: hidden states ``(batch, sequence, inner_dim)``.
	            context: optional context for the module's own K/V projections.
	            mask: optional additive mask.  A 1-D mask of length ``sequence``
	                is treated as per-query and repeated over batch, heads and
	                keys; any other mask is reshaped to
	                ``(batch, heads, -1, last_dim)``.
	            external_k_in: key projections to attend to instead of the
	                module's own (both externals must be given to take effect).
	            external_v_in: value projections to attend to.

	        Returns:
	            The attention output after ``to_out[0]`` and ``to_out[1]``.
	        """
	        batch_size, sequence_length, inner_dim = x.shape
	        h = self.heads
	        head_dim = inner_dim // h

	        q_in = self.to_q(x)
	        # dtype attention actually runs in; the mask has to match it, otherwise
	        # scaled_dot_product_attention rejects the mask when upcasting is on
	        attn_dtype = torch.float32 if shared.opts.upcast_attn else q_in.dtype

	        if mask is not None:
	            mask = self.prepare_attention_mask(mask, sequence_length, batch_size)
	            mask = mask.to(attn_dtype)
	            if len(mask.shape) == 1 and mask.shape[0] == sequence_length:
	                # we are getting a slice of the mask covering sequence_length, need to repeat in all other dimensions
	                mask = mask.unsqueeze(-1).repeat(batch_size, h, 1, sequence_length)
	            else:
	                mask = mask.view(batch_size, self.heads, -1, mask.shape[-1])

	        use_external = external_k_in is not None and external_v_in is not None
	        log_recon = self.controller.log_recon
	        k_in = v_in = None
	        if log_recon or not use_external:
	            context = default(context, x)
	            context_k, context_v = hypernetwork.apply_hypernetworks(shared.loaded_hypernetworks, context)
	            k_in = self.to_k(context_k)
	            v_in = self.to_v(context_v)
	            if log_recon:
	                self.controller.report_sattn(self.module_key, {'k_in': k_in, 'v_in': v_in})
	        if use_external:
	            # be aware that hypernetworks will have no effect
	            k_in, v_in = external_k_in, external_v_in

	        q = q_in.view(batch_size, -1, h, head_dim).transpose(1, 2)
	        k = k_in.view(batch_size, -1, h, head_dim).transpose(1, 2)
	        v = v_in.view(batch_size, -1, h, head_dim).transpose(1, 2)

	        del q_in, k_in, v_in

	        dtype = q.dtype
	        if shared.opts.upcast_attn:
	            q, k, v = q.float(), k.float(), v.float()

	        # the output of sdp = (batch, num_heads, seq_len, head_dim)
	        hidden_states = torch.nn.functional.scaled_dot_product_attention(
	            q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False
	        )

	        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, h * head_dim)
	        hidden_states = hidden_states.to(dtype)

	        # linear proj
	        hidden_states = self.to_out[0](hidden_states)
	        # dropout
	        hidden_states = self.to_out[1](hidden_states)

	        del q, k, v
	        return hidden_states


	class ProxyLoggedCrossAttn(object):
	    """Proxy that replaces an attention module's ``forward`` with a logging one.

	    With ``is_xattn=False`` the replacement computes scaled-dot-product
	    attention and reports the key/value projections to the controller
	    (``report_sattn``).  With ``is_xattn=True`` it computes sliced attention
	    and reports the softmax probabilities of the controller's
	    ``foreground_indexes`` key positions (``report_xattn``).

	    Attributes that are not stored on the proxy itself (``heads``, ``to_q``,
	    ``to_k``, ``to_v``, ``to_out``, ``scale`` ...) are resolved on the wrapped
	    module while the proxy is attached.
	    """

	    # Names stored on the proxy itself; these are never delegated.
	    _OWN_ATTRS = ('org_module', 'org_forward', 'attached', 'controller', 'module_key', 'is_xattn')

	    def __init__(self, controller: "MasaController", module_key: str, org_module: torch.nn.Module = None, is_xattn=False):
	        super().__init__()
	        self.org_module = org_module
	        self.org_forward = None

	        self.attached = False
	        self.controller = controller
	        self.module_key = module_key
	        self.is_xattn = is_xattn

	    def __getattr__(self, attr):
	        """Delegate unknown attributes to the wrapped module while attached.

	        Raises:
	            AttributeError: if the proxy is detached, or ``attr`` is one of the
	                proxy's own names that has not been set.  The previous
	                implementation fell through and returned ``None``, which made
	                ``hasattr`` succeed for every name.
	        """
	        # Read through __dict__ so a missing own attribute cannot recurse here.
	        own = self.__dict__
	        if attr not in self._OWN_ATTRS and own.get('attached'):
	            return getattr(own['org_module'], attr)
	        raise AttributeError(f"{type(self).__name__!r} object has no attribute {attr!r}")

	    def attach(self):
	        """Install :meth:`forward` on the wrapped module (no-op if already attached)."""
	        if self.org_forward is not None:
	            return
	        self.org_forward = self.org_module.forward
	        self.org_module.forward = self.forward
	        self.attached = True

	    def detach(self):
	        """Restore the wrapped module's original ``forward`` (no-op if detached)."""
	        if self.org_forward is None:
	            return
	        self.org_module.forward = self.org_forward
	        self.org_forward = None
	        self.attached = False

	    def forward(self, x, context=None, mask=None):
	        """Dispatch to the cross- or self-attention logging implementation."""
	        if self.is_xattn:
	            return self.split_xattn_log_forward(x, context, mask)
	        return self.scaled_dot_product_sattn_log_forward(x, context, mask)

	    def scaled_dot_product_sattn_log_forward(self, x, context=None, mask=None):
	        """Self-attention via scaled-dot-product attention, logging K/V.

	        Args:
	            x: hidden states ``(batch, sequence, inner_dim)``.
	            context: optional context; defaults to ``x``.
	            mask: optional additive attention mask.

	        Returns:
	            The attention output after ``to_out[0]`` and ``to_out[1]``.
	        """
	        batch_size, sequence_length, inner_dim = x.shape
	        h = self.heads
	        head_dim = inner_dim // h

	        q_in = self.to_q(x)
	        # dtype attention actually runs in; the mask has to match it, otherwise
	        # scaled_dot_product_attention rejects the mask when upcasting is on
	        attn_dtype = torch.float32 if shared.opts.upcast_attn else q_in.dtype

	        if mask is not None:
	            # NOTE(review): prepare_attention_mask is not defined on this class,
	            # so it resolves through __getattr__ on the wrapped module - confirm
	            # the module provides it before passing a mask.
	            mask = self.prepare_attention_mask(mask, sequence_length, batch_size)
	            mask = mask.to(attn_dtype)
	            if len(mask.shape) == 1 and mask.shape[0] == sequence_length:
	                # we are getting a slice of the mask covering sequence_length, need to repeat in all other dimensions
	                mask = mask.unsqueeze(-1).repeat(batch_size, h, 1, sequence_length)
	            else:
	                mask = mask.view(batch_size, self.heads, -1, mask.shape[-1])

	        context = default(context, x)
	        context_k, context_v = hypernetwork.apply_hypernetworks(shared.loaded_hypernetworks, context)
	        k_in = self.to_k(context_k)
	        v_in = self.to_v(context_v)

	        self.controller.report_sattn(self.module_key, {'k_in': k_in, 'v_in': v_in})

	        q = q_in.view(batch_size, -1, h, head_dim).transpose(1, 2)
	        k = k_in.view(batch_size, -1, h, head_dim).transpose(1, 2)
	        v = v_in.view(batch_size, -1, h, head_dim).transpose(1, 2)

	        del q_in, k_in, v_in

	        dtype = q.dtype
	        if shared.opts.upcast_attn:
	            q, k, v = q.float(), k.float(), v.float()

	        # the output of sdp = (batch, num_heads, seq_len, head_dim)
	        hidden_states = torch.nn.functional.scaled_dot_product_attention(
	            q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False
	        )

	        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, h * head_dim)
	        hidden_states = hidden_states.to(dtype)

	        # linear proj
	        hidden_states = self.to_out[0](hidden_states)
	        # dropout
	        hidden_states = self.to_out[1](hidden_states)
	        return hidden_states

	    def split_xattn_log_forward(self, x, context=None, mask=None):
	        """Sliced cross-attention that logs foreground-token probabilities.

	        Args:
	            x: hidden states fed to ``to_q``.
	            context: optional context; defaults to ``x``.
	            mask: accepted for signature compatibility; not used here.

	        Returns:
	            The attention output after ``to_out``.

	        Raises:
	            RuntimeError: if more than 64 slices would be required.
	        """
	        h = self.heads

	        q_in = self.to_q(x)
	        context = default(context, x)

	        context_k, context_v = hypernetwork.apply_hypernetworks(shared.loaded_hypernetworks, context)
	        k_in = self.to_k(context_k)
	        v_in = self.to_v(context_v)

	        dtype = q_in.dtype
	        if shared.opts.upcast_attn:
	            q_in, k_in, v_in = q_in.float(), k_in.float(), v_in if v_in.device.type == 'mps' else v_in.float()

	        with devices.without_autocast(disable=not shared.opts.upcast_attn):
	            k_in = k_in * self.scale

	            del context, x

	            q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
	            del q_in, k_in, v_in

	            r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)

	            mem_free_total = get_available_vram()

	            gb = 1024 ** 3
	            tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * q.element_size()
	            modifier = 3 if q.element_size() == 2 else 2.5
	            mem_required = tensor_size * modifier
	            steps = 1

	            if mem_required > mem_free_total:
	                steps = 2 ** (math.ceil(math.log(mem_required / mem_free_total, 2)))

	            if steps > 64:
	                max_res = math.floor(math.sqrt(math.sqrt(mem_free_total / 2.5)) / 8) * 64
	                raise RuntimeError(f'Not enough memory, use lower resolution (max approx. {max_res}x{max_res}). '
	                                   f'Need: {mem_required / 64 / gb:0.1f}GB free, Have:{mem_free_total / gb:0.1f}GB free')

	            foreground_ids = self.controller.foreground_indexes
	            xattn_report_sim = torch.zeros(q.shape[0], q.shape[1], len(foreground_ids), device=q.device, dtype=q.dtype)
	            # Fall back to a single slice when the length is not divisible.
	            slice_size = q.shape[1] // steps if (q.shape[1] % steps) == 0 else q.shape[1]
	            for i in range(0, q.shape[1], slice_size):
	                end = i + slice_size
	                s1 = einsum('b i d, b j d -> b i j', q[:, i:end], k)

	                s2 = s1.softmax(dim=-1, dtype=q.dtype)
	                del s1

	                # s2 holds only the rows of the current slice, so it is indexed
	                # locally.  The previous s2[:, i:end, ...] used global row
	                # indices and was only correct for the first slice.
	                for report_idx, token_idx in enumerate(foreground_ids):
	                    xattn_report_sim[:, i:end, report_idx] = s2[:, :, token_idx]

	                r1[:, i:end] = einsum('b i j, b j d -> b i d', s2, v)
	                del s2

	            self.controller.report_xattn(self.module_key, {'sim': xattn_report_sim})
	            del q, k, v

	        r1 = r1.to(dtype)

	        r2 = rearrange(r1, '(b h) n d -> b n (h d)', h=h)
	        del r1

	        return self.to_out(r2)

	# oom for 1728 x 944
	# def forward(self, x, context=None, mask=None):
	# h = self.heads
	#
	# q = self.to_q(x)
	# context = default(context, x)
	# k = self.to_k(context)
	# v = self.to_v(context)
	#
	# if not self.is_xattn:
	# sattn_data_suite = {'k_in': k, 'v_in': v}
	# self.controller.report_sattn(self.module_key, sattn_data_suite)
	#
	# q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
	#
	# # force cast to fp32 to avoid overflowing
	# # if _ATTN_PRECISION == "fp32":
	# # with torch.autocast(enabled=False, device_type='cuda'):
	# # q, k = q.float(), k.float()
	# # sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
	# # else:
	# sim = einsum('b i d, b j d -> b i j', q, k) * self.scale
	#
	# if self.is_xattn:
	# del q, k
	#
	# if exists(mask):
	# mask = rearrange(mask, 'b ... -> b (...)')
	# max_neg_value = -torch.finfo(sim.dtype).max
	# mask = repeat(mask, 'b j -> (b h) () j', h=h)
	# sim.masked_fill_(~mask, max_neg_value)
	#
	# # attention, what we cannot get enough of
	# sim = sim.softmax(dim=-1)
	#
	# if self.is_xattn:
	# xattn_data_suite = {'sim': sim}
	# self.controller.report_xattn(self.module_key, xattn_data_suite)
	#
	#
	#
	#
	# out = einsum('b i j, b j d -> b i d', sim, v)
	# out = rearrange(out, '(b h) n d -> b n (h d)', h=h)
	# return self.to_out(out)

	# def scaled_dot_product_attention_forward(self, x, context=None, mask=None):
	# batch_size, sequence_length, inner_dim = x.shape
	#
	# if mask is not None:
	# mask = self.prepare_attention_mask(mask, sequence_length, batch_size)
	# mask = mask.view(batch_size, self.heads, -1, mask.shape[-1])
	#
	# h = self.heads
	# q_in = self.to_q(x)
	# context = default(context, x)
	#
	# context_k, context_v = hypernetwork.apply_hypernetworks(shared.loaded_hypernetworks, context)
	# k_in = self.to_k(context_k)
	# v_in = self.to_v(context_v)
	#
	# q_t, k_t, v_t = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
	# with torch.autocast(enabled=False, device_type='cuda'):
	# q_t, k_t = q_t.float(), k_t.float()
	# sim = einsum('b i d, b j d -> b i j', q_t, k_t) * self.scale
	#
	# head_dim = inner_dim // h
	# q = q_in.view(batch_size, -1, h, head_dim).transpose(1, 2)
	# k = k_in.view(batch_size, -1, h, head_dim).transpose(1, 2)
	# v = v_in.view(batch_size, -1, h, head_dim).transpose(1, 2)
	#
	#
	#
	# del q_in, k_in, v_in
	#
	# dtype = q.dtype
	# if shared.opts.upcast_attn:
	# q, k, v = q.float(), k.float(), v.float()
	#
	#
	#
	# # the output of sdp = (batch, num_heads, seq_len, head_dim)
	# hidden_states = torch.nn.functional.scaled_dot_product_attention(
	# q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False
	# )
	#
	#
	#
	# hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, h * head_dim)
	#
	#
	#
	# if self.is_xattn:
	# xattn_report_data_dict = {'v': v, 'hidden_states': hidden_states}
	# self.controller.report_xattn(self.module_key, xattn_report_data_dict)
	# else:
	# sattn_report_data_dict = {'k': k, 'v': v}
	# self.controller.report_sattn(self.module_key, sattn_report_data_dict)
	#
	# # Compute the transpose of 'v'
	# v_transpose = torch.transpose(v_t, 1, 2)
	#
	# # Calculate the product of 'v' and its transpose
	# vvT = torch.matmul(v_t, v_transpose)
	#
	# # Compute the pseudo-inverse of the 'vvT'
	# inv_vvT = torch.inverse(vvT.cpu().to(torch.float32))
	#
	# hidden_states_re = rearrange(hidden_states, 'b n (h d) -> (b h) n d', h=h)
	# # Calculate the product of 'out' and the pseudo-inverse
	# # sim_recovered = torch.matmul(hidden_states_re.cpu().to(torch.float32), inv_vvT)
	# sim_recovered = torch.einsum('ikj,ilk->ikl', hidden_states_re.cpu().to(torch.float32), inv_vvT)
	#
	# # calculate loss between recovered sim and original sim
	# loss = torch.nn.functional.mse_loss(sim_recovered, sim.cpu())
	#
	#
	# hidden_states = hidden_states.to(dtype)
	#
	# # linear proj
	# hidden_states = self.to_out[0](hidden_states)
	#
	#
	#
	#
	#
	#
	#
	# # dropout
	# hidden_states = self.to_out[1](hidden_states)
	# return hidden_states


	class ProxyMasaUNetModel(object):
	    """Proxy around the UNet that notifies the controller before every forward pass.

	    While attached, the wrapped module's ``forward`` is replaced by
	    :meth:`forward`, which calls ``controller.masa_unet_signal(x, timesteps)``
	    and then runs the original ``forward`` unchanged.  Attributes that are not
	    stored on the proxy are resolved on the wrapped module while attached.
	    """

	    # Names stored on the proxy itself; these are never delegated.
	    _OWN_ATTRS = ('org_module', 'org_forward', 'attached', 'controller')

	    def __init__(self, controller: "MasaController", org_module: torch.nn.Module = None):
	        super().__init__()
	        self.org_module = org_module
	        self.org_forward = None
	        self.attached = False
	        self.controller = controller

	    def __getattr__(self, attr):
	        """Delegate unknown attributes to the wrapped module while attached.

	        Raises:
	            AttributeError: if the proxy is detached, or ``attr`` is one of the
	                proxy's own names that has not been set.  The previous
	                implementation fell through and returned ``None``, which made
	                ``hasattr`` succeed for every name.
	        """
	        # Read through __dict__ so a missing own attribute cannot recurse here.
	        own = self.__dict__
	        if attr not in self._OWN_ATTRS and own.get('attached'):
	            return getattr(own['org_module'], attr)
	        raise AttributeError(f"{type(self).__name__!r} object has no attribute {attr!r}")

	    def attach(self):
	        """Install :meth:`forward` on the wrapped module (no-op if already attached)."""
	        if self.org_forward is not None:
	            return
	        self.org_forward = self.org_module.forward
	        self.org_module.forward = self.forward
	        self.attached = True

	    def detach(self):
	        """Restore the wrapped module's original ``forward`` (no-op if detached)."""
	        if self.org_forward is None:
	            return
	        self.org_module.forward = self.org_forward
	        self.org_forward = None
	        self.attached = False

	    def forward(self, x, timesteps=None, context=None, y=None, **kwargs):
	        """Signal the controller, then delegate to the original ``forward``."""
	        self.controller.masa_unet_signal(x, timesteps)
	        return self.org_forward(x, timesteps=timesteps, context=context, y=y, **kwargs)

	# Cross-attention modules that logging_attach_xattn hooks; only names in this
	# list get their attention maps logged.
	aggregate_xattn_map_selected_module_keys = [
	    f'{block}.1.transformer_blocks.0.attn2'
	    for block in (
	        'input_blocks.7',
	        'input_blocks.8',
	        'output_blocks.3',
	        'output_blocks.4',
	        'output_blocks.5',
	    )
	]

	@enum.unique
	class MasaControllerMode(enum.IntEnum):
	    """Operating modes accepted by ``MasaController.mode_init`` / ``mode_end``."""

	    # attach the cross- and self-attention logging proxies
	    LOGGING = 0
	    # attach the reconstruction self-attention proxies
	    RECON = 1
	    # reconstruction plus cross-attention logging (sets ``log_recon``)
	    LOGRECON = 2
	    # no attention proxies are attached
	    IDLE = 3


	class MasaController:
	def __init__(self, ori_unet: UNetModel):
	self.monitoring_xattn_modules = {}
	self.monitoring_sattn_modules = {}
	self.logged_xattn_map_data_suite = {}
	self.logged_sattn_data_suite = {}
	self.proxy_xattn_modules = {}
	self.proxy_sattn_modules = {}
	self.proxy_recon_sattn_mmodules = {}

	self.recording_mode = True
	self.current_timestep: float = -1.0
	self.current_latent_size = (0,0)
	self.unet_proxy = ProxyMasaUNetModel(self, ori_unet)
	self.recon_averaged_xattn_map_reference = {}
	self.mode = MasaControllerMode.LOGGING
	self.start_timestep = 900.0
	self.start_layer = 10
	self.recon_mask_threshold = 0.1
	for name, module in ori_unet.named_modules():
	module_name = type(module).__name__
	if module_name == "CrossAttention":
	if 'attn2' in name:
	self.proxy_xattn_modules[name] = ProxyLoggedCrossAttn(self, name, module, True)

	elif 'attn1' in name:
	self.proxy_sattn_modules[name] = ProxyLoggedCrossAttn(self, name, module)
	self.proxy_recon_sattn_mmodules[name] = ProxyReconMasaSattn(self, name, module)

	self.log_recon = False
	self.recon_logged_sattn_kv_suite = {}
	self.foreground_indexes = [1]
	self.current_timestep_unet_pass = 0






	def logging_attach_all(self):
	for name, module in self.proxy_xattn_modules.items():
	module.attach()
	for name, module in self.proxy_sattn_modules.items():
	module.attach()
	self.unet_proxy.attach()

	def logging_detach_all(self):
	for name, module in self.proxy_xattn_modules.items():
	module.detach()
	for name, module in self.proxy_sattn_modules.items():
	module.detach()
	self.unet_proxy.detach()

	def logging_attach_xattn(self):
	for name, module in self.proxy_xattn_modules.items():
	if name in aggregate_xattn_map_selected_module_keys:
	module.attach()

	def logging_detach_xattn(self):
	for name, module in self.proxy_xattn_modules.items():
	module.detach()

	def logging_attach_sattn(self):
	for name, module in self.proxy_sattn_modules.items():
	module.attach()

	def logging_detach_sattn(self):
	for name, module in self.proxy_sattn_modules.items():
	module.detach()







	def report_xattn(self, name, xattn_map_data_dict):
	timestep_str_key = str(self.current_timestep)
	if self.current_timestep_unet_pass == 0:

	self.logged_xattn_map_data_suite[timestep_str_key][name] = xattn_map_data_dict
	# else:
	# print('debug for unmatched uncond pass')

	def report_sattn(self, name, sattn_map_data_dict):
	timestep_str_key = str(self.current_timestep)
	# if name not in self.logged_sattn_data_suite[timestep_str_key][self.current_timestep_unet_pass]:
	# pass
	# else:
	# print('debug for sattn report overwrite')

	# have to save VRAM
	sattn_map_data_dict_cpu = {
	key: value.cpu() for key, value in sattn_map_data_dict.items()
	}
	self.logged_sattn_data_suite[timestep_str_key][self.current_timestep_unet_pass][name] = sattn_map_data_dict_cpu
	del sattn_map_data_dict




	def recon_attach_sattn(self):
	layer_idx = 0
	for name, module in self.proxy_recon_sattn_mmodules.items():
	layer_idx += 1
	if layer_idx < self.start_layer:
	continue
	module.attach()


	def recon_detach_all(self):

	for name, module in self.proxy_recon_sattn_mmodules.items():
	module.detach()
	self.unet_proxy.detach()

	def retrieve_sattn_mask(self, name):
	return self.recon_averaged_xattn_map_reference[self.current_timestep]

	def query_masa_active(self, name):
	return self.current_timestep <= self.start_timestep

	def retrieve_masa_info_suite(self, key):
	current_mask = self.recon_averaged_xattn_map_reference[str(self.current_timestep)]
	current_kv = self.recon_logged_sattn_kv_suite[str(self.current_timestep)][self.current_timestep_unet_pass][key]
	return current_mask, current_kv, self.recon_mask_threshold


	def masa_unet_signal(self, x, timesteps):
	last_timestep = self.current_timestep
	current_timestep = timesteps[0].item()
	if last_timestep == current_timestep:
	self.current_timestep_unet_pass += 1
	else:
	self.current_timestep_unet_pass = 0
	self.current_timestep = current_timestep

	timestep_str_key = str(self.current_timestep)
	self.current_latent_size = x.shape[-2:]
	if self.mode == MasaControllerMode.LOGGING or self.mode == MasaControllerMode.LOGRECON:
	if timestep_str_key not in self.logged_xattn_map_data_suite:
	self.logged_xattn_map_data_suite[timestep_str_key] = {}
	if timestep_str_key not in self.logged_sattn_data_suite:
	self.logged_sattn_data_suite[timestep_str_key] = {}
	if self.current_timestep_unet_pass not in self.logged_sattn_data_suite[timestep_str_key]:
	self.logged_sattn_data_suite[timestep_str_key][self.current_timestep_unet_pass] = {}


	def calculate_reconstruction_maps(self):
	if self.logged_xattn_map_data_suite:
	print('Calculating mask from logged xattn maps...')
	reconstruction_xattn_timestep_map_dict = {}
	for timestep_str_key in self.logged_xattn_map_data_suite.keys():

	xattn_maps_of_interest = [v['sim'] for v in self.logged_xattn_map_data_suite[timestep_str_key].values()]
	for i in range(len(xattn_maps_of_interest)):
	attn_map = xattn_maps_of_interest[i]
	# aggregate along token dim
	attn_map = attn_map.sum(-1)
	# only interested in cond map
	if attn_map.shape[0] > 8:
	# cond uncond same pass
	attn_map, _ = attn_map.chunk(2, dim=0) # (head_count,N)
	# mean along head dim
	attn_map = attn_map.mean(0)
	# xattn_maps_of_interest[i] = attn_map
	res_h, res_w = self.current_latent_size
	xattn_maps_of_interest[i] = attn_map.reshape(math.ceil(res_h/4), math.ceil(res_w/4))

	attn_maps_aggregate = torch.stack(xattn_maps_of_interest, dim=0).mean(0)

	maps_min = attn_maps_aggregate.min()
	maps_max = attn_maps_aggregate.max()
	final_map = (attn_maps_aggregate - maps_min) / (maps_max - maps_min)
	reconstruction_xattn_timestep_map_dict[timestep_str_key] = final_map

	print(f'Processed timestep {timestep_str_key}...')

	self.recon_averaged_xattn_map_reference = reconstruction_xattn_timestep_map_dict
	del self.logged_xattn_map_data_suite
	self.logged_xattn_map_data_suite = {}
	self.recon_logged_sattn_kv_suite = self.logged_sattn_data_suite
	self.logged_sattn_data_suite = {}
	def mode_init(self, mode:MasaControllerMode, masa_start_step=5, masa_start_layer=10, mask_threshold=0.1, foreground_indexes=[1]):
	self.current_timestep = -1
	self.mode = mode
	match mode:
	case MasaControllerMode.LOGGING:
	self.logging_attach_xattn()
	self.logging_attach_sattn()

	case MasaControllerMode.RECON \| MasaControllerMode.LOGRECON:
	if mode == MasaControllerMode.LOGRECON:
	self.log_recon = True
	self.logging_attach_xattn()
	else:
	self.log_recon = False

	# order matters because of start_layer

	self.recon_params_init(masa_start_step, masa_start_layer, mask_threshold)
	self.recon_attach_sattn()
	if mode is not MasaControllerMode.IDLE:
	self.foreground_indexes = foreground_indexes

	self.unet_proxy.attach()

	def recon_params_init(self, masa_start_step, masa_start_layer,mask_threshold):
	self.start_timestep = float(list(self.recon_averaged_xattn_map_reference.keys())[masa_start_step])
	self.start_layer = masa_start_layer
	self.recon_mask_threshold = mask_threshold



	def mode_end(self, mode:MasaControllerMode, foreground_indexes=None):
	match mode:
	case MasaControllerMode.LOGGING:
	self.logging_detach_all()
	self.calculate_reconstruction_maps()
	case MasaControllerMode.RECON:
	self.recon_detach_all()
	case MasaControllerMode.LOGRECON:
	self.recon_detach_all()
	self.logging_detach_xattn()