Spaces:

omeregev
/

click2mask

Sleeping

App Files Files Community

click2mask / scripts /dyn_mask.py

omeregev

Initial commit

6df18f5 9 months ago

raw

history blame contribute delete

16.8 kB

	import torch
	from torchvision import transforms
	import numpy as np
	import skfmm
	from PIL import Image
	import torch.nn as nn
	import cv2
	import scipy
	from scipy.ndimage.filters import gaussian_filter
	import kornia
	import warnings
	warnings.filterwarnings("ignore", message="PyTorch version 1.7.1 or higher is recommended")
	import alpha_clip
	from augmentations import ImageAugmentations
	from constants import Const, N


	@torch.no_grad()
	def get_dist_field(dist_from, device, as_squeezed_np=False):
	    """Compute a signed distance field around a binary mask.

	    Non-zero entries of ``dist_from`` count as "inside". The mask is turned
	    into a level-set function that is -0.5 inside and +0.5 outside and handed
	    to ``skfmm.distance``, so the returned field is negative inside the mask
	    and positive outside it.

	    Args:
	        dist_from: Mask whose maximum is at most 1; either a NumPy array or an
	            object exposing ``.cpu().numpy()`` (e.g. a torch tensor).
	        device: Device the returned tensor is moved to (ignored when
	            ``as_squeezed_np`` is true).
	        as_squeezed_np: When true, return the NumPy array produced by
	            ``skfmm.distance`` unchanged (no squeeze is performed here).

	    Returns:
	        The distance field, as a NumPy array or as a tensor on ``device``.
	    """
	    mask_np = (
	        dist_from if isinstance(dist_from, np.ndarray) else dist_from.cpu().numpy()
	    )
	    assert np.max(mask_np) <= 1

	    # Zero contour of this function sits on the mask boundary.
	    level_set = -(np.where(mask_np, 0, -1) + 0.5)
	    signed_dist = skfmm.distance(level_set, dx=1)

	    return signed_dist if as_squeezed_np else torch.tensor(signed_dist).to(device)


	def get_surround(surround_from, surround_width, device, as_squeezed_np=False):
	    """Return the mask grown to everything within ``surround_width`` of it.

	    A position is set when its signed distance to ``surround_from`` (as given
	    by ``get_dist_field``) is at most ``surround_width``; the input mask itself
	    has negative distance and is therefore included for non-negative widths.

	    Args:
	        surround_from: Tensor mask; its ``dtype`` is reused for the result.
	        surround_width: Distance threshold, in the units of the distance field.
	        device: Device on which the distance field tensor is created.
	        as_squeezed_np: When true, return a NumPy array (moved to CPU; no
	            squeeze is performed here) instead of a tensor.

	    Returns:
	        The dilated mask as a tensor, or as a NumPy array when requested.
	    """
	    within_width = get_dist_field(surround_from, device) <= surround_width
	    grown = within_width.to(surround_from.dtype)
	    return grown.cpu().numpy() if as_squeezed_np else grown


	class DynMask:
	    """Click-seeded mask that is evolved step by step with Alpha-CLIP gradients.

	    A smooth potential field is built around the clicked point; thresholding
	    and cleaning it yields the binary latent mask. ``evolve_mask`` raises the
	    potential on a ring around the current mask boundary according to the
	    gradient of an Alpha-CLIP text/image loss with respect to the mask.
	    """

	    def __init__(self, click_pil, args, init_image_tensor, device, total_steps):
	        """Load Alpha-CLIP, encode the prompt and build the initial masks.

	        Args:
	            click_pil: PIL image marking the clicked point.
	            args: Options object; ``alpha_clip_scale`` and ``prompt`` are read.
	            init_image_tensor: Stored as ``self.init_image``.
	            device: Device used for the model and all mask tensors.
	            total_steps: Stored as ``self.total_steps``.
	        """
	        self.args = args
	        self.device = device
	        self.init_image = init_image_tensor
	        self.total_steps = total_steps

	        self.ac_size = (self.args.alpha_clip_scale, self.args.alpha_clip_scale)
	        if self.args.alpha_clip_scale == 336:
	            self.ac_model, self.ac_preprocess = alpha_clip.load(
	                "ViT-L/14@336px",
	                alpha_vision_ckpt_pth="./checkpoints/clip_l14_336_grit1m_fultune_8xe.pth",
	                device=self.device,
	            )
	        else:
	            self.ac_model, self.ac_preprocess = alpha_clip.load(
	                "ViT-L/14",
	                alpha_vision_ckpt_pth="./checkpoints/clip_l14_grit20m_fultune_2xe.pth",
	                device=self.device,
	            )

	        self.image_augmentations = ImageAugmentations(
	            self.args.alpha_clip_scale, Const.AUG_NUM
	        )
	        # Needs self.ac_model, so it must come after the model is loaded.
	        self.text_features = self.get_text_features([self.args.prompt])

	        self.latent_size = Const.LATENT_SIZE
	        self.decoded_size = (Const.H, Const.W)
	        self.thresh_val = Const.THRESH_VAL
	        self.base_potential = None
	        self.potential = None
	        self.latent_mask = None
	        self.set_init_masks(click_pil)

	        self.cached_masks_clones = {}
	        self.closs_hist = {}
	        self.latents_hist = {}
	        self.latent_masks_hist = {}

	    @torch.no_grad()
	    def normalize_point_size(self, click, radius_for64=1.367):
	        """Replace the clicked blob by a fixed-size blob at its centroid.

	        Args:
	            click: 2-D NumPy array; values above 0.5 count as clicked.
	            radius_for64: Radius used for a 64-pixel-high array; it is scaled
	                by ``click.shape[0] / 64`` and reduced by 0.3.

	        Returns:
	            NumPy array that is 1 within the scaled radius of the (rounded)
	            centroid of the clicked pixels and 0 elsewhere.

	        Raises:
	            ValueError: If no value in ``click`` exceeds 0.5.
	        """
	        threshed = (click > 0.5).astype(float)
	        rows, cols = np.where(threshed)
	        if rows.size == 0:
	            # Without this guard the mean of an empty array is NaN and the
	            # int() below fails with an unhelpful "cannot convert NaN" error.
	            raise ValueError("click image contains no pixels above the threshold")
	        center = int(rows.mean().round()), int(cols.mean().round())
	        norm_threshed = np.zeros_like(threshed)
	        norm_threshed[center[0], center[1]] = 1
	        norm_threshed = get_surround(
	            torch.tensor(norm_threshed).to(self.device),
	            click.shape[0] / 64 * radius_for64 - 0.3,
	            self.device,
	            as_squeezed_np=True,
	        )

	        return norm_threshed

	    @torch.no_grad()
	    def calc_potential(self, click_pil, sigma_for_shape64):
	        """Build a [0, 1]-normalised Gaussian bump around the clicked point.

	        Args:
	            click_pil: PIL image of the click; converted to greyscale, resized
	                to ``self.latent_size`` and binarised at 125.
	            sigma_for_shape64: Gaussian sigma for a 64-pixel-high array; scaled
	                by the actual height.

	        Returns:
	            Half-precision tensor on ``self.device`` with two leading singleton
	            axes added in front of the 2-D field.
	        """
	        dest_size = self.latent_size
	        click = click_pil.convert("L").resize(dest_size, Image.NEAREST)
	        click = (np.array(click) > 125).astype(float)
	        click = self.normalize_point_size(
	            click, radius_for64=Const.POINT_ON_LATENT_RADIUS
	        )
	        potential = gaussian_filter(
	            click, sigma=sigma_for_shape64 * (click.shape[0]) / 64
	        )
	        # Min-max normalise; the floor avoids dividing by zero on a flat field.
	        potential = (potential - np.min(potential)) / max(
	            np.max(potential) - np.min(potential), 1e-8
	        )
	        potential = potential[np.newaxis, np.newaxis, ...]
	        potential = torch.from_numpy(potential).half().to(self.device)

	        return potential

	    @torch.no_grad()
	    def set_init_masks(self, click_pil, stretch_factor=1.0):
	        """Initialise ``base_potential`` from the click and derive step-0 masks.

	        The normalised potential is mapped linearly from [0, 1] to
	        [-1, ``Const.POTENTIAL_PEAK``] and multiplied by ``stretch_factor``.
	        """
	        potential = self.calc_potential(
	            click_pil, sigma_for_shape64=Const.SIGMA_FOR_SHAPE64
	        )
	        self.base_potential = potential.detach().to(torch.float64)
	        if self.base_potential.ndim == 2:
	            self.base_potential = self.base_potential.unsqueeze(0).unsqueeze(0)
	        self.base_potential = self.base_potential * (Const.POTENTIAL_PEAK - (-1)) - 1
	        self.base_potential = stretch_factor * self.base_potential

	        self.set_cur_masks(step_i=0)

	    @torch.no_grad()
	    def set_cur_masks(
	        self, step_i, grads_to_update=None, surround_ring=None, return_only=None
	    ):
	        """Recompute ``self.potential`` and ``self.latent_mask`` for a step.

	        The potential is the base potential plus the step bias; when gradients
	        are supplied they are added on ``surround_ring`` (scaled by
	        ``Const.MASK_LR``) and the result is blurred. If the potential ends up
	        entirely non-positive or entirely non-negative it is shifted by
	        ``Const.ADDITION_IN_COLLAPSE`` and a message is printed.

	        Args:
	            step_i: Step index passed to ``get_bias``.
	            grads_to_update: Optional gradient map added to the potential.
	            surround_ring: Ring mask multiplied into the gradient update;
	                required when ``grads_to_update`` is given.
	            return_only: Forwarded to ``get_curr_masks``.

	        Returns:
	            Whatever ``get_curr_masks(return_only=return_only)`` returns.
	        """
	        potential = self.base_potential + self.get_bias(step_i)

	        if grads_to_update is not None:
	            potential = potential + (surround_ring * Const.MASK_LR * grads_to_update)
	            # NOTE(review): source indentation was lost; the blur is assumed to
	            # apply only after a gradient update — confirm against upstream.
	            potential = transforms.GaussianBlur(
	                Const.GAUSS_K_MASK, sigma=Const.GAUSS_SIGMA_MASK
	            )(potential)

	        if torch.all(potential <= 0):
	            potential += Const.ADDITION_IN_COLLAPSE
	            print(
	                f"{'':10} Mask shrunk entirely, added {Const.ADDITION_IN_COLLAPSE}"
	            )
	        elif torch.all(potential >= 0):
	            potential -= Const.ADDITION_IN_COLLAPSE
	            print(
	                f"{'':10} Mask expanded entirely, reduced {Const.ADDITION_IN_COLLAPSE}"
	            )

	        self.potential = potential.half()
	        self.latent_mask = self.get_threshed_mask(self.potential)

	        return self.get_curr_masks(return_only=return_only)

	    @torch.no_grad()
	    def get_curr_masks(self, return_only=None):
	        """Return the current potential and/or latent mask.

	        Args:
	            return_only: ``None`` for both, ``N.POTENTIAL`` for the potential
	                only, ``N.LATENT_MASK`` for the latent mask only.

	        Returns:
	            ``(potential, latent_mask)`` or the single requested tensor.

	        Raises:
	            ValueError: If ``return_only`` is any other non-None value.
	        """
	        if return_only is not None:
	            if return_only == N.POTENTIAL:
	                return self.potential
	            elif return_only == N.LATENT_MASK:
	                return self.latent_mask
	            else:
	                raise ValueError(f"return_only should be in ('{N.POTENTIAL}', '{N.LATENT_MASK}')")

	        return self.potential, self.latent_mask

	    @torch.no_grad()
	    def make_cached_masks_clones(self, name):
	        """Store detached clones of the current potential and mask under ``name``."""
	        self.cached_masks_clones[name] = {
	            N.POTENTIAL: self.potential.detach().clone(),
	            N.LATENT_MASK: self.latent_mask.detach().clone(),
	        }

	    @torch.no_grad()
	    def set_masks_from_cached_masks_clones(self, name):
	        """Restore potential and mask from the snapshot stored under ``name``.

	        The cached tensors are assigned directly (not cloned again).
	        """
	        self.potential = self.cached_masks_clones[name][N.POTENTIAL]
	        self.latent_mask = self.cached_masks_clones[name][N.LATENT_MASK]

	    @torch.no_grad()
	    def evolve_mask(
	        self, step_i, decoder, latent_pred_z0, source_latents, return_only=None
	    ):
	        """Run one mask-evolution step driven by the Alpha-CLIP loss gradient.

	        The absolute gradient of the loss with respect to the mask is blurred,
	        standardised (zero mean, unit std), clipped at zero and then applied on
	        the ring around the current mask via ``set_cur_masks``.

	        Args:
	            step_i: Current step index.
	            decoder: Callable whose result exposes ``.sample``; used to decode
	                the blended latents.
	            latent_pred_z0: Predicted latents, blended inside the mask.
	            source_latents: Source latents, kept outside the mask.
	            return_only: Forwarded to ``get_curr_masks``.

	        Returns:
	            Whatever ``get_curr_masks(return_only=return_only)`` returns.
	        """

	        potential, latent_mask = self.get_curr_masks()
	        surround_ring = self.get_ring(latent_mask)
	        grads_latent = self.calc_grads(
	            latent_pred_z0=latent_pred_z0,
	            source_latents=source_latents,
	            potential=potential,
	            step_i=step_i,
	            decoder=decoder,
	        )
	        grads_latent = torch.abs(grads_latent)
	        grads_latent = transforms.GaussianBlur(
	            Const.GAUSS_K_GRADS, sigma=Const.GAUSS_SIGMA_GRADS
	        )(grads_latent)

	        grads_latent = (grads_latent - grads_latent.mean()) / max(
	            grads_latent.std(), 1e-6
	        )
	        # Keep only above-average gradient magnitudes.
	        grads_latent = torch.maximum(grads_latent, torch.tensor(0.0).to(self.device))

	        self.set_cur_masks(
	            step_i=step_i, grads_to_update=grads_latent, surround_ring=surround_ring
	        )

	        return self.get_curr_masks(return_only=return_only)

	    def calc_grads(self, latent_pred_z0, source_latents, potential, step_i, decoder):
	        """Return d(Alpha-CLIP loss)/d(latent mask) as a detached float64 tensor.

	        Gradients are explicitly re-enabled because the caller (``evolve_mask``)
	        runs under ``torch.no_grad()``. The loss value is also recorded in
	        ``self.closs_hist`` under ``step_i - 1``.
	        """
	        with torch.enable_grad():
	            latent_mask = self.get_threshed_mask(potential)
	            latent_mask = latent_mask.detach().requires_grad_()

	            blend_predz0_origz0 = latent_pred_z0 * latent_mask + (
	                source_latents * (1 - latent_mask)
	            )

	            scaled_blend_pred_z0_origz0 = 1 / 0.18215 * blend_predz0_origz0
	            decoded_blend_predz0_origz0 = decoder(
	                scaled_blend_pred_z0_origz0
	            ).sample.to(torch.float32)

	            # The alpha channel is detached, so the gradient reaches
	            # latent_mask only through the blended, decoded image.
	            alpha_mask = transforms.Resize(self.decoded_size, interpolation=0)(
	                latent_mask
	            )
	            alpha_mask = (alpha_mask > 0.5).half().clone().detach()
	            alpha_mask = get_surround(
	                alpha_mask,
	                Const.ALPHA_MASK_DILATION_ON_512 * (Const.HW / 512.0),
	                self.device,
	            )

	            alpha_loss = self.alpha_clip_loss(
	                decoded_blend_predz0_origz0,
	                alpha_mask,
	                self.text_features,
	                self.image_augmentations,
	                augs_with_orig=True,
	            )

	            self.closs_hist[
	                step_i - 1
	            ] = alpha_loss.detach()  # The mask used for the loss is prev step mask

	            grads_latent = torch.autograd.grad(alpha_loss, latent_mask)[0].to(
	                torch.float64
	            )

	        return grads_latent.detach()

	    def alpha_clip_loss(
	        self,
	        image,
	        mask,
	        text_features,
	        image_augmentations,
	        augs_with_orig=True,
	        return_as_similarity=False,
	    ):
	        """Alpha-CLIP distance (or similarity) between a masked image and text.

	        Args:
	            image: Image tensor, 3-D or 4-D. It is mapped with ``(x + 1) / 2``
	                before normalisation, so it is presumably expected in
	                [-1, 1] — confirm against the decoder's output range.
	            mask: Alpha mask with values in [0, 1] (asserted), 3-D or 4-D.
	            text_features: Normalised text embeddings.
	            image_augmentations: Callable producing augmented
	                ``(image, alpha)`` batches, or ``None`` to only resize the
	                inputs to ``self.ac_size``.
	            augs_with_orig: Passed to the augmentation callable as
	                ``with_orig``.
	            return_as_similarity: Return the similarity instead of
	                ``1 - similarity``.

	        Returns:
	            The loss (or similarity) averaged over the batch dimension.
	        """
	        assert mask.min() >= 0 and mask.max() <= 1

	        image_mean = (0.48145466, 0.4578275, 0.40821073)
	        image_std = (0.26862954, 0.26130258, 0.27577711)
	        image_normalize = transforms.Normalize(image_mean, image_std)
	        mask_normalize = transforms.Normalize(0.5, 0.26)

	        image = image.add(1).div(2)
	        if image.ndim == 3:
	            image = image.unsqueeze(0)

	        alpha = mask
	        if alpha.ndim == 3:
	            alpha = alpha.unsqueeze(dim=0)

	        if image_augmentations is not None:
	            image, alpha = image_augmentations(image, alpha, with_orig=augs_with_orig)
	        else:
	            # No augmentations: just bring both inputs to the model resolution.
	            image = transforms.Resize(self.ac_size, interpolation=Image.BICUBIC)(image)
	            alpha = nn.AdaptiveAvgPool2d(self.ac_size)(alpha)
	        image = image_normalize(image).half()
	        alpha = mask_normalize(alpha).half()

	        image_features = self.ac_model.visual(image, alpha)
	        image_features = image_features / image_features.norm(dim=-1, keepdim=True)

	        if return_as_similarity:
	            alpha_loss = image_features @ text_features.T
	        else:
	            alpha_loss = 1 - image_features @ text_features.T
	        # NOTE(review): source indentation was lost; the batch mean is assumed
	        # to apply to both branches — confirm against upstream.
	        alpha_loss = alpha_loss.mean(dim=0)

	        return alpha_loss

	    def get_text_features(self, prompt):
	        """Encode a list/tuple of prompts into L2-normalised text features."""
	        assert isinstance(prompt, (list, tuple))
	        text = alpha_clip.tokenize(prompt).to(self.device)
	        text_features = self.ac_model.encode_text(text)
	        text_features = text_features / text_features.norm(dim=-1, keepdim=True)
	        return text_features

	    @torch.no_grad()
	    def get_bias(self, step_i):
	        """Return the dilation bias for ``step_i``.

	        The bias decays geometrically with the step index and is further
	        shrunk by 10% at a time while adding it would make the whole base
	        potential positive (stopping once the bias drops to 1e-8 or below).
	        """
	        bias = Const.BIAS_DILATION_VAL * (Const.BIAS_DILATION_DEC_FACTOR**step_i)
	        while torch.all(self.base_potential + bias > 0) and bias > 1e-8:
	            bias *= 0.9

	        return bias

	    def get_threshed_mask(self, potential):
	        """Threshold ``potential`` and clean the result into a binary mask.

	        Steps: threshold at ``self.thresh_val``, fill holes, connect separate
	        components, morphological closing, fill holes again, then blur and
	        re-threshold at ``Const.THRESH_POST_GAUSS``.

	        Returns:
	            Half-precision 0/1 tensor of shape (1, 1, H, W) on ``self.device``.
	        """
	        thresh_val = self.thresh_val
	        t_m = (potential > thresh_val).half()

	        t_m = t_m.cpu().numpy().squeeze().astype(np.uint8)
	        t_m = scipy.ndimage.binary_fill_holes(t_m)
	        t_m = torch.tensor(t_m).to(self.device).unsqueeze(0).unsqueeze(0).half()
	        t_m = self.close_gaps_with_connection(
	            t_m, thickness=Const.CLOSE_GAPS_WITH_CONNECTION_THICKNESS
	        )

	        t_m = kornia.morphology.closing(
	            t_m, torch.ones(Const.CLOSING_K, Const.CLOSING_K).to(self.device)
	        )
	        t_m = t_m.cpu().numpy().squeeze().astype(np.uint8)
	        t_m = scipy.ndimage.binary_fill_holes(t_m)
	        t_m = torch.tensor(t_m).to(self.device).unsqueeze(0).unsqueeze(0).half()

	        t_m = transforms.GaussianBlur(
	            Const.GAUSS_K_THRESHED, sigma=Const.GAUSS_SIGMA_THRESHED
	        )(t_m)
	        t_m = (t_m > Const.THRESH_POST_GAUSS).half()

	        return t_m

	    @torch.no_grad()
	    def close_gaps_with_connection(self, threshed_mask, thickness):
	        """Join separate mask components to the largest one.

	        Contours whose area is at most 0.1% of the image are dropped. Every
	        remaining contour is linked to the largest contour by a polygon made
	        of its simplified convex hull plus the nearest points on the largest
	        contour. A mask with exactly one contour is returned unchanged.

	        Args:
	            threshed_mask: Binary mask tensor (squeezed to 2-D internally).
	            thickness: Outline thickness used when drawing the connection.

	        Returns:
	            Half-precision 0/1 tensor of shape (1, 1, H, W) on ``self.device``
	            (or the input tensor itself in the single-contour case).
	        """
	        # also cleans small contours
	        given_threshed_mask = threshed_mask
	        threshed_mask = threshed_mask.cpu().numpy().squeeze().astype(np.uint8)

	        connected_mask = threshed_mask * 0
	        contours, hierarchy = cv2.findContours(
	            threshed_mask, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE
	        )
	        if len(contours) == 1:
	            return given_threshed_mask

	        min_area = threshed_mask.shape[-1] * threshed_mask.shape[-2] * 0.001
	        contours = sorted(contours, key=cv2.contourArea, reverse=True)
	        contours = [cnt for cnt in contours if cv2.contourArea(cnt) > min_area]

	        # contours[0] is the largest component; the others are tied to it.
	        cv2.drawContours(connected_mask, contours, 0, 255, -1)
	        for i in range(1, len(contours)):
	            cv2.drawContours(connected_mask, contours, i, 255, -1)
	            hull = cv2.convexHull(contours[i])  # Convex hull of contour
	            hull = cv2.approxPolyDP(hull, 0.1 * cv2.arcLength(hull, True), True)
	            connect = hull.copy()
	            for hp in hull:
	                dists = np.linalg.norm(contours[0] - hp, axis=2).squeeze()
	                min_points = np.where(dists == dists.min())[0]
	                for mp in min_points:
	                    connect = np.append(
	                        connect, np.expand_dims(contours[0][mp], axis=0), axis=0
	                    )
	            # Draw the connecting polygon both as an outline and filled.
	            connected_mask = cv2.drawContours(
	                connected_mask, [connect], -1, color=255, thickness=thickness
	            )
	            connected_mask = cv2.drawContours(
	                connected_mask, [connect], -1, color=255, thickness=-1
	            )

	        connected_mask = (
	            ((torch.tensor(connected_mask).to(self.device)) > 125)
	            .unsqueeze(0)
	            .unsqueeze(0)
	            .half()
	        )
	        return connected_mask

	    @torch.no_grad()
	    def get_plain_dilated_latent_mask(
	        self,
	        last_step_latent_mask,
	        step_i,
	        total_steps,
	        max_area_ratio_for_dilation=None,
	        rerun_dyn_start_step_i=None,
	    ):
	        """Dilate a mask by a width taken from a per-step schedule.

	        Args:
	            last_step_latent_mask: Tensor mask to dilate.
	            step_i: Index into the dilation schedule.
	            total_steps: Length of the dilation schedule.
	            max_area_ratio_for_dilation: Masks covering more than this fraction
	                of their elements are returned unchanged; defaults to
	                ``Const.MAX_AREA_RATIO_FOR_DILATION``.
	            rerun_dyn_start_step_i: When truthy, use a linear schedule that
	                reaches zero and is zero-padded to ``total_steps``; otherwise
	                use a ``first_k / max(1, i / 3)`` decay.

	        Returns:
	            The input mask itself (large-mask case) or the dilated mask in
	            half precision.
	        """
	        if max_area_ratio_for_dilation is None:
	            max_area_ratio_for_dilation = Const.MAX_AREA_RATIO_FOR_DILATION
	        if (
	            last_step_latent_mask.sum()
	            > max_area_ratio_for_dilation * last_step_latent_mask.nelement()
	        ):
	            return last_step_latent_mask

	        # The distance field depends only on the mask, so compute it once
	        # instead of re-running fast marching for every candidate width.
	        dists = get_dist_field(last_step_latent_mask, self.device)
	        mask_dtype = last_step_latent_mask.dtype

	        # Largest starting width whose dilation covers at most 75% of the area.
	        first_k = self.latent_size[-1] // 2
	        while (
	            (dists <= first_k).to(mask_dtype).sum()
	            > 0.75 * self.latent_size[-1] ** 2
	        ):
	            first_k -= 1
	        if rerun_dyn_start_step_i:
	            plain_dilation_ws = np.linspace(
	                first_k, 0, rerun_dyn_start_step_i + 2 - Const.RERUN_STOP_DILATION
	            ).round()
	            plain_dilation_ws = np.pad(
	                plain_dilation_ws, (0, total_steps - len(plain_dilation_ws))
	            )
	        else:
	            plain_dilation_ws = np.array(
	                [first_k / max(1, (i / 3)) for i in range(total_steps)]
	            ).round()
	            # NOTE(review): source indentation was lost; zeroing the last ten
	            # widths is assumed to belong to this branch — confirm upstream.
	            plain_dilation_ws[-10:] = 0

	        return (dists <= plain_dilation_ws[step_i]).to(mask_dtype).half()

	    @torch.no_grad()
	    def get_ring(self, latent_mask):
	        """Build the band around the mask boundary where the mask may change.

	        The band is the union of an inner ring (inside the mask, governed by
	        ``Const.IN_ON_RING_WIDTH``), the mask's boundary pixels, and an outer
	        ring up to ``Const.OUT_RING_WIDTH`` outside the mask.

	        Args:
	            latent_mask: Tensor mask with values in [0, 1] (asserted).

	        Returns:
	            uint8 0/1 tensor on ``self.device`` with the shape of
	            ``latent_mask``.
	        """
	        assert (latent_mask.min() >= 0) and (latent_mask.max() <= 1)
	        out_ring_width = Const.OUT_RING_WIDTH
	        in_on_ring_width = Const.IN_ON_RING_WIDTH
	        latent_mask = (latent_mask.cpu().numpy() >= 0.5).astype(np.float16)
	        # Signed distances: negative inside the mask, positive outside.
	        dists = get_dist_field(latent_mask, self.device, as_squeezed_np=True)

	        # Inner ring: -in_ring_width - 1 < dist <= -1.
	        in_ring_width = in_on_ring_width - 1
	        in_ring = dists.copy()
	        in_ring[in_ring > -1] = 0
	        in_ring[in_ring <= -in_ring_width - 1] = 0
	        in_ring[in_ring != 0] = 1

	        # Boundary pixels: mask pixels not deeper than 1 inside.
	        on_ring = latent_mask.copy()
	        on_ring[dists < -1] = 0

	        in_on_ring = in_ring.astype(bool) | on_ring.astype(bool)

	        # Outer ring: 0 < dist <= out_ring_width.
	        out_ring = dists.copy()
	        out_ring[out_ring <= 0] = 0
	        out_ring[out_ring > out_ring_width] = 0
	        out_ring[out_ring != 0] = 1

	        surround_ring = in_on_ring.astype(np.uint8) | out_ring.astype(np.uint8)
	        surround_ring = torch.tensor(surround_ring).to(self.device)

	        return surround_ring