from typing import Any, Literal, Union

import cv2
import numpy as np
import torch
from einops import rearrange
from PIL import Image

from invokeai.backend.image_util.util import nms, normalize_image_channel_count

CONTROLNET_RESIZE_VALUES = Literal[
    "just_resize",
    "crop_resize",
    "fill_resize",
    "just_resize_simple",
]
CONTROLNET_MODE_VALUES = Literal["balanced", "more_prompt", "more_control", "unbalanced"]

# The edge-thinning ("lvmin") kernels and helpers below are used by heuristic_resize,
# which notes it was adapted from https://github.com/Mikubill/sd-webui-controlnet.

lvmin_kernels_raw = [
    np.array([[-1, -1, -1], [0, 1, 0], [1, 1, 1]], dtype=np.int32),
    np.array([[0, -1, -1], [1, 1, -1], [0, 1, 0]], dtype=np.int32),
]

lvmin_kernels = []
lvmin_kernels += [np.rot90(x, k=0, axes=(0, 1)) for x in lvmin_kernels_raw]
lvmin_kernels += [np.rot90(x, k=1, axes=(0, 1)) for x in lvmin_kernels_raw]
lvmin_kernels += [np.rot90(x, k=2, axes=(0, 1)) for x in lvmin_kernels_raw]
lvmin_kernels += [np.rot90(x, k=3, axes=(0, 1)) for x in lvmin_kernels_raw]

lvmin_prunings_raw = [
    np.array([[-1, -1, -1], [-1, 1, -1], [0, 0, -1]], dtype=np.int32),
    np.array([[-1, -1, -1], [-1, 1, -1], [-1, 0, 0]], dtype=np.int32),
]

lvmin_prunings = []
lvmin_prunings += [np.rot90(x, k=0, axes=(0, 1)) for x in lvmin_prunings_raw]
lvmin_prunings += [np.rot90(x, k=1, axes=(0, 1)) for x in lvmin_prunings_raw]
lvmin_prunings += [np.rot90(x, k=2, axes=(0, 1)) for x in lvmin_prunings_raw]
lvmin_prunings += [np.rot90(x, k=3, axes=(0, 1)) for x in lvmin_prunings_raw]


def remove_pattern(x, kernel):
    """Zero out pixels matching the given hit-or-miss kernel; also report whether anything changed."""
    objects = cv2.morphologyEx(x, cv2.MORPH_HITMISS, kernel)
    objects = np.where(objects > 127)
    x[objects] = 0
    return x, objects[0].shape[0] > 0


def thin_one_time(x, kernels):
    """Apply one pass of all thinning kernels. Returns the image and whether thinning has converged."""
    y = x
    is_done = True
    for k in kernels:
        y, has_update = remove_pattern(y, k)
        if has_update:
            is_done = False
    return y, is_done


def lvmin_thin(x, prunings=True):
    """Iteratively thin a binary (0/255) edge map until convergence (at most 32 passes), optionally pruning stubs."""
    y = x
    for _i in range(32):
        y, is_done = thin_one_time(y, lvmin_kernels)
        if is_done:
            break
    if prunings:
        y, _ = thin_one_time(y, lvmin_prunings)
    return y
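
# Example (illustrative): lvmin_thin expects a single-channel uint8 image whose edge pixels are ~255
# and background ~0, e.g. the output of cv2.threshold(..., cv2.THRESH_BINARY + cv2.THRESH_OTSU)
# as used in heuristic_resize below:
#
#   thinned = lvmin_thin(binary_edge_map, prunings=True)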


def pixel_perfect_resolution(
    image: np.ndarray,
    target_H: int,
    target_W: int,
    resize_mode: str,
) -> int:
    """
    Calculate the estimated resolution for resizing an image while preserving its aspect ratio.

    The function first calculates scaling factors for the height and width of the image based on the target
    height and width. Then, based on the chosen resize mode, it uses either the smaller or the larger
    scaling factor to estimate the new resolution.

    If the resize mode is "fill_resize", the function uses the smaller scaling factor, ensuring the whole image
    fits within the target dimensions, potentially leaving some empty space.

    For any other resize mode, the function uses the larger scaling factor, ensuring the target
    dimensions are fully filled, potentially cropping the image.

    Args:
        image (np.ndarray): A 3D numpy array representing an image. The dimensions represent [height, width, channels].
        target_H (int): The target height for the image.
        target_W (int): The target width for the image.
        resize_mode (str): The mode for resizing, e.g. "fill_resize".

    Returns:
        int: The estimated resolution after resizing.
    """
    raw_H, raw_W, _ = image.shape

    k0 = float(target_H) / float(raw_H)
    k1 = float(target_W) / float(raw_W)

    if resize_mode == "fill_resize":
        estimation = min(k0, k1) * float(min(raw_H, raw_W))
    else:
        estimation = max(k0, k1) * float(min(raw_H, raw_W))

    return int(np.round(estimation))
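
# Worked example (illustrative): for a 512x768 (HxW) input and a 1024x1024 target,
# k0 = 1024/512 = 2.0 and k1 = 1024/768 ~= 1.33. With resize_mode="fill_resize" the smaller
# factor is used, giving round(1.33 * 512) = 683; any other mode uses the larger factor,
# giving round(2.0 * 512) = 1024.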


def clone_contiguous(x: np.ndarray[Any, Any]) -> np.ndarray[Any, Any]:
    """Get a memory-contiguous clone of the given numpy array, as a safety measure and to improve computation efficiency."""
    return np.ascontiguousarray(x).copy()


def np_img_to_torch(np_img: np.ndarray[Any, Any], device: torch.device) -> torch.Tensor:
    """Convert a numpy image to a PyTorch tensor. The image is normalized to 0-1, rearranged to BCHW format and sent to
    the specified device."""

    torch_img = torch.from_numpy(np_img)
    normalized = torch_img.float() / 255.0
    bchw = rearrange(normalized, "h w c -> 1 c h w")
    on_device = bchw.to(device)
    return on_device.clone()
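
# Example (illustrative): a uint8 HWC image of shape (480, 640, 3) becomes a float32 tensor of
# shape (1, 3, 480, 640) with values in [0, 1] on the requested device:
#
#   tensor = np_img_to_torch(np_img, torch.device("cpu"))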


def heuristic_resize(np_img: np.ndarray[Any, Any], size: tuple[int, int]) -> np.ndarray[Any, Any]:
    """Resizes an image using a heuristic to choose the best resizing strategy.

    - If the image appears to be an edge map, special handling will be applied to ensure the edges are not distorted.
    - Single-pixel edge maps use NMS and thinning to keep the edges as single-pixel lines.
    - Low-color-count images are resized with nearest-neighbor to preserve color information (e.g. segmentation maps).
    - The alpha channel is handled separately to ensure it is resized correctly.

    Args:
        np_img (np.ndarray): The input image.
        size (tuple[int, int]): The target size (width, height) for the image.

    Returns:
        np.ndarray: The resized image.

    Adapted from https://github.com/Mikubill/sd-webui-controlnet.
    """

    # Return early if the image is already the requested size.
    if np_img.shape[0] == size[1] and np_img.shape[1] == size[0]:
        return np_img

    # If the image has an alpha channel, set it aside and resize it separately at the end.
    inpaint_mask = None
    if np_img.ndim == 3 and np_img.shape[2] == 4:
        inpaint_mask = np_img[:, :, 3]
        np_img = np_img[:, :, 0:3]

    new_size_is_smaller = (size[0] * size[1]) < (np_img.shape[0] * np_img.shape[1])
    new_size_is_bigger = (size[0] * size[1]) > (np_img.shape[0] * np_img.shape[1])
    unique_color_count = np.unique(np_img.reshape(-1, np_img.shape[2]), axis=0).shape[0]
    is_one_pixel_edge = False
    is_binary = False

    if unique_color_count == 2:
        # Exactly two colors suggests a binary edge map. Check whether its edges are one pixel wide.
        is_binary = np.min(np_img) < 16 and np.max(np_img) > 240
        if is_binary:
            eroded = cv2.erode(np_img, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
            dilated = cv2.dilate(eroded, np.ones(shape=(3, 3), dtype=np.uint8), iterations=1)
            one_pixel_edge_count = np.where(dilated < np_img)[0].shape[0]
            all_edge_count = np.where(np_img > 127)[0].shape[0]
            is_one_pixel_edge = one_pixel_edge_count * 2 > all_edge_count

    if 2 < unique_color_count < 200:
        # A low color count suggests a segmentation or label map; use nearest-neighbor interpolation
        # so that colors are not blended.
        interpolation = cv2.INTER_NEAREST
    elif new_size_is_smaller:
        # Area interpolation works best for downscaling.
        interpolation = cv2.INTER_AREA
    else:
        # Cubic interpolation works best for upscaling.
        interpolation = cv2.INTER_CUBIC

    # The resized image may be further processed below, depending on the characteristics detected above.
    resized = cv2.resize(np_img, size, interpolation=interpolation)

    if inpaint_mask is not None:
        # Resize the alpha channel with the same interpolation method as the image.
        inpaint_mask = cv2.resize(inpaint_mask, size, interpolation=interpolation)

    # If the image is a binary edge map, re-binarize it and, if needed, thin the edges back to single-pixel lines.
    if is_binary:
        resized = np.mean(resized.astype(np.float32), axis=2).clip(0, 255).astype(np.uint8)
        if is_one_pixel_edge:
            # Use NMS and thinning to keep the edges as single-pixel lines.
            resized = nms(resized)
            _, resized = cv2.threshold(resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
            resized = lvmin_thin(resized, prunings=new_size_is_bigger)
        else:
            _, resized = cv2.threshold(resized, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
        resized = np.stack([resized] * 3, axis=2)

    # Restore the alpha channel, binarized and concatenated back onto the resized image.
    if inpaint_mask is not None:
        inpaint_mask = (inpaint_mask > 127).astype(np.float32) * 255.0
        inpaint_mask = inpaint_mask[:, :, None].clip(0, 255).astype(np.uint8)
        resized = np.concatenate([resized, inpaint_mask], axis=2)

    return resized
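
# Example usage (illustrative; variable names are hypothetical):
#
#   # A binary Canny edge map (0/255, single-pixel lines) is resized via the NMS + thinning path,
#   # while a low-color-count segmentation map is resized with nearest-neighbor interpolation.
#   resized_edges = heuristic_resize(canny_edge_map, (768, 768))
#   resized_seg = heuristic_resize(segmentation_map, (768, 768))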


def np_img_resize(
    np_img: np.ndarray,
    resize_mode: CONTROLNET_RESIZE_VALUES,
    h: int,
    w: int,
    device: torch.device = torch.device("cpu"),
) -> tuple[torch.Tensor, np.ndarray[Any, Any]]:
    """Resize a numpy image to (h, w) according to the given ControlNet resize mode.

    Returns both a BCHW torch tensor on the given device and the resized numpy image.
    """
    np_img = normalize_image_channel_count(np_img)

    if resize_mode == "just_resize":
        # Stretch the image to the target size, ignoring the aspect ratio.
        np_img = heuristic_resize(np_img, (w, h))
        np_img = clone_contiguous(np_img)
        return np_img_to_torch(np_img, device), np_img

    old_h, old_w, _ = np_img.shape
    old_w = float(old_w)
    old_h = float(old_h)
    k0 = float(h) / old_h
    k1 = float(w) / old_w

    def safeint(x: Union[int, float]) -> int:
        return int(np.round(x))

    if resize_mode == "fill_resize":
        # Fit the whole image inside the target size and pad the borders with the median border color.
        k = min(k0, k1)
        borders = np.concatenate([np_img[0, :, :], np_img[-1, :, :], np_img[:, 0, :], np_img[:, -1, :]], axis=0)
        high_quality_border_color = np.median(borders, axis=0).astype(np_img.dtype)
        if len(high_quality_border_color) == 4:
            # Force the padded background to be fully opaque.
            high_quality_border_color[3] = 255
        high_quality_background = np.tile(high_quality_border_color[None, None], [h, w, 1])
        np_img = heuristic_resize(np_img, (safeint(old_w * k), safeint(old_h * k)))
        new_h, new_w, _ = np_img.shape
        pad_h = max(0, (h - new_h) // 2)
        pad_w = max(0, (w - new_w) // 2)
        high_quality_background[pad_h : pad_h + new_h, pad_w : pad_w + new_w] = np_img
        np_img = high_quality_background
        np_img = clone_contiguous(np_img)
        return np_img_to_torch(np_img, device), np_img
    else:
        # "crop_resize": fill the target size and center-crop the overflow.
        k = max(k0, k1)
        np_img = heuristic_resize(np_img, (safeint(old_w * k), safeint(old_h * k)))
        new_h, new_w, _ = np_img.shape
        pad_h = max(0, (new_h - h) // 2)
        pad_w = max(0, (new_w - w) // 2)
        np_img = np_img[pad_h : pad_h + h, pad_w : pad_w + w]
        np_img = clone_contiguous(np_img)
        return np_img_to_torch(np_img, device), np_img


def prepare_control_image(
    image: Image.Image,
    width: int,
    height: int,
    num_channels: int = 3,
    device: str | torch.device = "cuda",
    dtype: torch.dtype = torch.float16,
    control_mode: CONTROLNET_MODE_VALUES = "balanced",
    resize_mode: CONTROLNET_RESIZE_VALUES = "just_resize_simple",
    do_classifier_free_guidance: bool = True,
) -> torch.Tensor:
    """Pre-process images for ControlNets or T2I-Adapters.

    Args:
        image (Image.Image): The PIL image to pre-process.
        width (int): The target width in pixels.
        height (int): The target height in pixels.
        num_channels (int, optional): The target number of image channels. This is achieved by converting the input
            image to RGB, then naively taking the first `num_channels` channels. The primary use case is converting an
            RGB image to a single-channel grayscale image. Raises if `num_channels` cannot be achieved. Defaults to 3.
        device (str | torch.device, optional): The target device for the output image. Defaults to "cuda".
        dtype (torch.dtype, optional): The dtype for the output image. Defaults to torch.float16.
        do_classifier_free_guidance (bool, optional): If True, repeat the output image along the batch dimension.
            Defaults to True.
        control_mode (str, optional): Defaults to "balanced".
        resize_mode (str, optional): Defaults to "just_resize_simple".

    Raises:
        ValueError: If `resize_mode` is not recognized.
        ValueError: If `num_channels` is out of range.

    Returns:
        torch.Tensor: The pre-processed input tensor.
    """
    if resize_mode == "just_resize_simple":
        # Simple path: stretch to the target size with PIL and convert to a BCHW float tensor in [0, 1].
        image = image.convert("RGB")
        image = image.resize((width, height), resample=Image.LANCZOS)
        nimage = np.array(image)
        nimage = nimage[None, :]
        nimage = np.concatenate([nimage], axis=0)
        nimage = np.array(nimage).astype(np.float32) / 255.0
        nimage = nimage.transpose(0, 3, 1, 2)
        timage = torch.from_numpy(nimage)

    elif resize_mode == "just_resize" or resize_mode == "crop_resize" or resize_mode == "fill_resize":
        # ControlNet-aware path: pick an interpolation strategy per image type and honor the resize mode.
        nimage = np.array(image)
        timage, nimage = np_img_resize(
            np_img=nimage,
            resize_mode=resize_mode,
            h=height,
            w=width,
            device=torch.device(device),
        )
    else:
        raise ValueError(f"Unsupported resize_mode: '{resize_mode}'.")

    if timage.shape[1] < num_channels or num_channels <= 0:
        raise ValueError(f"Cannot achieve the target of num_channels={num_channels}.")
    timage = timage[:, :num_channels, :, :]

    timage = timage.to(device=device, dtype=dtype)
    cfg_injection = control_mode == "more_control" or control_mode == "unbalanced"
    if do_classifier_free_guidance and not cfg_injection:
        timage = torch.cat([timage] * 2)
    return timage
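
# Example usage (illustrative; the input image path and target size are hypothetical):
#
#   control_tensor = prepare_control_image(
#       image=Image.open("canny.png"),
#       width=1024,
#       height=1024,
#       device="cuda",
#       dtype=torch.float16,
#       control_mode="balanced",
#       resize_mode="just_resize",
#   )
#   # With do_classifier_free_guidance=True (the default) and a non-CFG-injection control mode,
#   # the batch dimension is doubled for the conditioned/unconditioned passes.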