Spaces:

fffiloni
/

x-decoder-video

Paused

App Files Files Community

fffiloni commited on Feb 16, 2023

Commit

20cfcc2

1 Parent(s): 01d3b78

Delete utils

Browse files

Files changed (22) hide show

utils/Config.py +0 -24
utils/__init__.py +0 -0
utils/__pycache__/__init__.cpython-38.pyc +0 -0
utils/__pycache__/arguments.cpython-38.pyc +0 -0
utils/__pycache__/ddim.cpython-38.pyc +0 -0
utils/__pycache__/distributed.cpython-38.pyc +0 -0
utils/__pycache__/inpainting.cpython-38.pyc +0 -0
utils/__pycache__/misc.cpython-38.pyc +0 -0
utils/__pycache__/model.cpython-38.pyc +0 -0
utils/__pycache__/model_loading.cpython-38.pyc +0 -0
utils/__pycache__/readme.txt +0 -0
utils/__pycache__/util.cpython-38.pyc +0 -0
utils/__pycache__/visualizer.cpython-38.pyc +0 -0
utils/arguments.py +0 -98
utils/ddim.py +0 -203
utils/distributed.py +0 -180
utils/inpainting.py +0 -177
utils/misc.py +0 -122
utils/model.py +0 -32
utils/model_loading.py +0 -42
utils/util.py +0 -283
utils/visualizer.py +0 -1278

utils/Config.py DELETED Viewed

@@ -1,24 +0,0 @@
-from fvcore.common.config import CfgNode as _CfgNode
-class CfgNode(_CfgNode):
-    """
-    The same as `fvcore.common.config.CfgNode`, but different in:
-    1. Use unsafe yaml loading by default.
-       Note that this may lead to arbitrary code execution: you must not
-       load a config file from untrusted sources before manually inspecting
-       the content of the file.
-    2. Support config versioning.
-       When attempting to merge an old config, it will convert the old config automatically.
-    .. automethod:: clone
-    .. automethod:: freeze
-    .. automethod:: defrost
-    .. automethod:: is_frozen
-    .. automethod:: load_yaml_with_base
-    .. automethod:: merge_from_list
-    .. automethod:: merge_from_other_cfg
-    """
-    def merge_from_dict(self, dict):
-        pass
-node = CfgNode()

utils/__init__.py DELETED Viewed

File without changes

utils/__pycache__/__init__.cpython-38.pyc DELETED Viewed

Binary file (130 Bytes)

utils/__pycache__/arguments.cpython-38.pyc DELETED Viewed

Binary file (3.98 kB)

utils/__pycache__/ddim.cpython-38.pyc DELETED Viewed

Binary file (6.17 kB)

utils/__pycache__/distributed.cpython-38.pyc DELETED Viewed

Binary file (4.9 kB)

utils/__pycache__/inpainting.cpython-38.pyc DELETED Viewed

Binary file (5.14 kB)

utils/__pycache__/misc.cpython-38.pyc DELETED Viewed

Binary file (3.71 kB)

utils/__pycache__/model.cpython-38.pyc DELETED Viewed

Binary file (817 Bytes)

utils/__pycache__/model_loading.cpython-38.pyc DELETED Viewed

Binary file (1.1 kB)

utils/__pycache__/readme.txt DELETED Viewed

File without changes

utils/__pycache__/util.cpython-38.pyc DELETED Viewed

Binary file (9.97 kB)

utils/__pycache__/visualizer.cpython-38.pyc DELETED Viewed

Binary file (43.1 kB)

utils/arguments.py DELETED Viewed

@@ -1,98 +0,0 @@
-import yaml
-import json
-import argparse
-import logging
-logger = logging.getLogger(__name__)
-def load_config_dict_to_opt(opt, config_dict):
-    """
-    Load the key, value pairs from config_dict to opt, overriding existing values in opt
-    if there is any.
-    """
-    if not isinstance(config_dict, dict):
-        raise TypeError("Config must be a Python dictionary")
-    for k, v in config_dict.items():
-        k_parts = k.split('.')
-        pointer = opt
-        for k_part in k_parts[:-1]:
-            if k_part not in pointer:
-                pointer[k_part] = {}
-            pointer = pointer[k_part]
-            assert isinstance(pointer, dict), "Overriding key needs to be inside a Python dict."
-        ori_value = pointer.get(k_parts[-1])
-        pointer[k_parts[-1]] = v
-        if ori_value:
-            logger.warning(f"Overrided {k} from {ori_value} to {pointer[k_parts[-1]]}")
-def load_opt_from_config_files(conf_file):
-    """
-    Load opt from the config files, settings in later files can override those in previous files.
-    Args:
-        conf_files: config file path
-    Returns:
-        dict: a dictionary of opt settings
-    """
-    opt = {}
-    with open(conf_file, encoding='utf-8') as f:
-        config_dict = yaml.safe_load(f)
-    load_config_dict_to_opt(opt, config_dict)
-    return opt
-def load_opt_command(args):
-    parser = argparse.ArgumentParser(description='MainzTrain: Pretrain or fine-tune models for NLP tasks.')
-    parser.add_argument('command', help='Command: train/evaluate/train-and-evaluate')
-    parser.add_argument('--conf_files', required=True, help='Path(s) to the MainzTrain config file(s).')
-    parser.add_argument('--config_overrides', nargs='*', help='Override parameters on config with a json style string, e.g. {"<PARAM_NAME_1>": <PARAM_VALUE_1>, "<PARAM_GROUP_2>.<PARAM_SUBGROUP_2>.<PARAM_2>": <PARAM_VALUE_2>}. A key with "." updates the object in the corresponding nested dict. Remember to escape " in command line.')
-    parser.add_argument('--overrides', help='arguments that used to overide the config file in cmdline', nargs=argparse.REMAINDER)
-    cmdline_args = parser.parse_args() if not args else parser.parse_args(args)
-    opt = load_opt_from_config_files(cmdline_args.conf_files)
-    if cmdline_args.config_overrides:
-        config_overrides_string = ' '.join(cmdline_args.config_overrides)
-        logger.warning(f"Command line config overrides: {config_overrides_string}")
-        config_dict = json.loads(config_overrides_string)
-        load_config_dict_to_opt(opt, config_dict)
-    if cmdline_args.overrides:
-        assert len(cmdline_args.overrides) % 2 == 0, "overides arguments is not paired, required: key value"
-        keys = [cmdline_args.overrides[idx*2] for idx in range(len(cmdline_args.overrides)//2)]
-        vals = [cmdline_args.overrides[idx*2+1] for idx in range(len(cmdline_args.overrides)//2)]
-        vals = [val.replace('false', '').replace('False','') if len(val.replace(' ', '')) == 5 else val for val in vals]
-        types = []
-        for key in keys:
-            key = key.split('.')
-            ele = opt.copy()
-            while len(key) > 0:
-                ele = ele[key.pop(0)]
-            types.append(type(ele))
-        config_dict = {x:z(y) for x,y,z in zip(keys, vals, types)}
-        load_config_dict_to_opt(opt, config_dict)
-    # combine cmdline_args into opt dictionary
-    for key, val in cmdline_args.__dict__.items():
-        if val is not None:
-            opt[key] = val
-    return opt, cmdline_args
-def save_opt_to_json(opt, conf_file):
-    with open(conf_file, 'w', encoding='utf-8') as f:
-        json.dump(opt, f, indent=4)
-def save_opt_to_yaml(opt, conf_file):
-    with open(conf_file, 'w', encoding='utf-8') as f:
-        yaml.dump(opt, f)

utils/ddim.py DELETED Viewed

@@ -1,203 +0,0 @@
-"""SAMPLING ONLY."""
-import torch
-import numpy as np
-from tqdm import tqdm
-from functools import partial
-from .util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like
-class DDIMSampler(object):
-    def __init__(self, model, schedule="linear", **kwargs):
-        super().__init__()
-        self.model = model
-        self.ddpm_num_timesteps = model.num_timesteps
-        self.schedule = schedule
-    def register_buffer(self, name, attr):
-        if type(attr) == torch.Tensor:
-            if attr.device != torch.device("cuda"):
-                attr = attr.to(torch.device("cuda"))
-        setattr(self, name, attr)
-    def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
-        self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
-                                                  num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose)
-        alphas_cumprod = self.model.alphas_cumprod
-        assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
-        to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
-        self.register_buffer('betas', to_torch(self.model.betas))
-        self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
-        self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
-        # calculations for diffusion q(x_t | x_{t-1}) and others
-        self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
-        self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
-        self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
-        # ddim sampling parameters
-        ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
-                                                                                   ddim_timesteps=self.ddim_timesteps,
-                                                                                   eta=ddim_eta,verbose=verbose)
-        self.register_buffer('ddim_sigmas', ddim_sigmas)
-        self.register_buffer('ddim_alphas', ddim_alphas)
-        self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
-        self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
-        sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
-            (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
-                        1 - self.alphas_cumprod / self.alphas_cumprod_prev))
-        self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
-    @torch.no_grad()
-    def sample(self,
-               S,
-               batch_size,
-               shape,
-               conditioning=None,
-               callback=None,
-               normals_sequence=None,
-               img_callback=None,
-               quantize_x0=False,
-               eta=0.,
-               mask=None,
-               x0=None,
-               temperature=1.,
-               noise_dropout=0.,
-               score_corrector=None,
-               corrector_kwargs=None,
-               verbose=True,
-               x_T=None,
-               log_every_t=100,
-               unconditional_guidance_scale=1.,
-               unconditional_conditioning=None,
-               # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
-               **kwargs
-               ):
-        if conditioning is not None:
-            if isinstance(conditioning, dict):
-                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
-                if cbs != batch_size:
-                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
-            else:
-                if conditioning.shape[0] != batch_size:
-                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
-        self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
-        # sampling
-        C, H, W = shape
-        size = (batch_size, C, H, W)
-        print(f'Data shape for DDIM sampling is {size}, eta {eta}')
-        samples, intermediates = self.ddim_sampling(conditioning, size,
-                                                    callback=callback,
-                                                    img_callback=img_callback,
-                                                    quantize_denoised=quantize_x0,
-                                                    mask=mask, x0=x0,
-                                                    ddim_use_original_steps=False,
-                                                    noise_dropout=noise_dropout,
-                                                    temperature=temperature,
-                                                    score_corrector=score_corrector,
-                                                    corrector_kwargs=corrector_kwargs,
-                                                    x_T=x_T,
-                                                    log_every_t=log_every_t,
-                                                    unconditional_guidance_scale=unconditional_guidance_scale,
-                                                    unconditional_conditioning=unconditional_conditioning,
-                                                    )
-        return samples, intermediates
-    @torch.no_grad()
-    def ddim_sampling(self, cond, shape,
-                      x_T=None, ddim_use_original_steps=False,
-                      callback=None, timesteps=None, quantize_denoised=False,
-                      mask=None, x0=None, img_callback=None, log_every_t=100,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None,):
-        device = self.model.betas.device
-        b = shape[0]
-        if x_T is None:
-            img = torch.randn(shape, device=device)
-        else:
-            img = x_T
-        if timesteps is None:
-            timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
-        elif timesteps is not None and not ddim_use_original_steps:
-            subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
-            timesteps = self.ddim_timesteps[:subset_end]
-        intermediates = {'x_inter': [img], 'pred_x0': [img]}
-        time_range = reversed(range(0,timesteps)) if ddim_use_original_steps else np.flip(timesteps)
-        total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
-        print(f"Running DDIM Sampling with {total_steps} timesteps")
-        iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
-        for i, step in enumerate(iterator):
-            index = total_steps - i - 1
-            ts = torch.full((b,), step, device=device, dtype=torch.long)
-            if mask is not None:
-                assert x0 is not None
-                img_orig = self.model.q_sample(x0, ts)  # TODO: deterministic forward pass?
-                img = img_orig * mask + (1. - mask) * img
-            outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
-                                      quantize_denoised=quantize_denoised, temperature=temperature,
-                                      noise_dropout=noise_dropout, score_corrector=score_corrector,
-                                      corrector_kwargs=corrector_kwargs,
-                                      unconditional_guidance_scale=unconditional_guidance_scale,
-                                      unconditional_conditioning=unconditional_conditioning)
-            img, pred_x0 = outs
-            if callback: callback(i)
-            if img_callback: img_callback(pred_x0, i)
-            if index % log_every_t == 0 or index == total_steps - 1:
-                intermediates['x_inter'].append(img)
-                intermediates['pred_x0'].append(pred_x0)
-        return img, intermediates
-    @torch.no_grad()
-    def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
-                      temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
-                      unconditional_guidance_scale=1., unconditional_conditioning=None):
-        b, *_, device = *x.shape, x.device
-        if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
-            e_t = self.model.apply_model(x, t, c)
-        else:
-            x_in = torch.cat([x] * 2)
-            t_in = torch.cat([t] * 2)
-            c_in = torch.cat([unconditional_conditioning, c])
-            e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
-            e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
-        if score_corrector is not None:
-            assert self.model.parameterization == "eps"
-            e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
-        alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
-        alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
-        sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
-        sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
-        # select parameters corresponding to the currently considered timestep
-        a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
-        a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
-        sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
-        sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device)
-        # current prediction for x_0
-        pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
-        if quantize_denoised:
-            pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
-        # direction pointing to x_t
-        dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t
-        noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
-        if noise_dropout > 0.:
-            noise = torch.nn.functional.dropout(noise, p=noise_dropout)
-        x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
-        return x_prev, pred_x0

utils/distributed.py DELETED Viewed

@@ -1,180 +0,0 @@
-import os
-import time
-import torch
-import pickle
-import torch.distributed as dist
-def init_distributed(opt):
-    opt['CUDA'] = opt.get('CUDA', True) and torch.cuda.is_available()
-    if 'OMPI_COMM_WORLD_SIZE' not in os.environ:
-        # application was started without MPI
-        # default to single node with single process
-        opt['env_info'] = 'no MPI'
-        opt['world_size'] = 1
-        opt['local_size'] = 1
-        opt['rank'] = 0
-        opt['local_rank'] = 0
-        opt['master_address'] = '127.0.0.1'
-        opt['master_port'] = '8673'
-    else:
-        # application was started with MPI
-        # get MPI parameters
-        opt['world_size'] = int(os.environ['OMPI_COMM_WORLD_SIZE'])
-        opt['local_size'] = int(os.environ['OMPI_COMM_WORLD_LOCAL_SIZE'])
-        opt['rank'] = int(os.environ['OMPI_COMM_WORLD_RANK'])
-        opt['local_rank'] = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
-    # set up device
-    if not opt['CUDA']:
-        assert opt['world_size'] == 1, 'multi-GPU training without CUDA is not supported since we use NCCL as communication backend'
-        opt['device'] = torch.device("cpu")
-    else:
-        torch.cuda.set_device(opt['local_rank'])
-        opt['device'] = torch.device("cuda", opt['local_rank'])
-    return opt
-def is_main_process():
-    rank = 0
-    if 'OMPI_COMM_WORLD_SIZE' in os.environ:
-        rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
-    return rank == 0
-def get_world_size():
-    if not dist.is_available():
-        return 1
-    if not dist.is_initialized():
-        return 1
-    return dist.get_world_size()
-def get_rank():
-    if not dist.is_available():
-        return 0
-    if not dist.is_initialized():
-        return 0
-    return dist.get_rank()
-def synchronize():
-    """
-    Helper function to synchronize (barrier) among all processes when
-    using distributed training
-    """
-    if not dist.is_available():
-        return
-    if not dist.is_initialized():
-        return
-    world_size = dist.get_world_size()
-    rank = dist.get_rank()
-    if world_size == 1:
-        return
-    def _send_and_wait(r):
-        if rank == r:
-            tensor = torch.tensor(0, device="cuda")
-        else:
-            tensor = torch.tensor(1, device="cuda")
-        dist.broadcast(tensor, r)
-        while tensor.item() == 1:
-            time.sleep(1)
-    _send_and_wait(0)
-    # now sync on the main process
-    _send_and_wait(1)
-def all_gather(data):
-    """
-    Run all_gather on arbitrary picklable data (not necessarily tensors)
-    Args:
-        data: any picklable object
-    Returns:
-        list[data]: list of data gathered from each rank
-    """
-    world_size = get_world_size()
-    if world_size == 1:
-        return [data]
-    # serialized to a Tensor
-    buffer = pickle.dumps(data)
-    storage = torch.ByteStorage.from_buffer(buffer)
-    tensor = torch.ByteTensor(storage).to("cuda")
-    # obtain Tensor size of each rank
-    local_size = torch.IntTensor([tensor.numel()]).to("cuda")
-    size_list = [torch.IntTensor([0]).to("cuda") for _ in range(world_size)]
-    dist.all_gather(size_list, local_size)
-    size_list = [int(size.item()) for size in size_list]
-    max_size = max(size_list)
-    # receiving Tensor from all ranks
-    # we pad the tensor because torch all_gather does not support
-    # gathering tensors of different shapes
-    tensor_list = []
-    for _ in size_list:
-        tensor_list.append(torch.ByteTensor(size=(max_size,)).to("cuda"))
-    if local_size != max_size:
-        padding = torch.ByteTensor(size=(max_size - local_size,)).to("cuda")
-        tensor = torch.cat((tensor, padding), dim=0)
-    dist.all_gather(tensor_list, tensor)
-    data_list = []
-    for size, tensor in zip(size_list, tensor_list):
-        buffer = tensor.cpu().numpy().tobytes()[:size]
-        data_list.append(pickle.loads(buffer))
-    return data_list
-def reduce_dict(input_dict, average=True):
-    """
-    Args:
-        input_dict (dict): all the values will be reduced
-        average (bool): whether to do average or sum
-    Reduce the values in the dictionary from all processes so that process with rank
-    0 has the averaged results. Returns a dict with the same fields as
-    input_dict, after reduction.
-    """
-    world_size = get_world_size()
-    if world_size < 2:
-        return input_dict
-    with torch.no_grad():
-        names = []
-        values = []
-        # sort the keys so that they are consistent across processes
-        for k in sorted(input_dict.keys()):
-            names.append(k)
-            values.append(input_dict[k])
-        values = torch.stack(values, dim=0)
-        dist.reduce(values, dst=0)
-        if dist.get_rank() == 0 and average:
-            # only main process gets accumulated, so only divide by
-            # world_size in this case
-            values /= world_size
-        reduced_dict = {k: v for k, v in zip(names, values)}
-    return reduced_dict
-def broadcast_data(data):
-    if not torch.distributed.is_initialized():
-        return data
-    rank = dist.get_rank()
-    if rank == 0:
-        data_tensor = torch.tensor(data + [0], device="cuda")
-    else:
-        data_tensor = torch.tensor(data + [1], device="cuda")
-    torch.distributed.broadcast(data_tensor, 0)
-    while data_tensor.cpu().numpy()[-1] == 1:
-        time.sleep(1)
-    return data_tensor.cpu().numpy().tolist()[:-1]
-def reduce_sum(tensor):
-    if get_world_size() <= 1:
-        return tensor
-    tensor = tensor.clone()
-    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
-    return tensor

utils/inpainting.py DELETED Viewed

@@ -1,177 +0,0 @@
-import sys
-import cv2
-import torch
-import numpy as np
-import gradio as gr
-from PIL import Image
-from omegaconf import OmegaConf
-from einops import repeat
-from imwatermark import WatermarkEncoder
-from pathlib import Path
-from .ddim import DDIMSampler
-from .util import instantiate_from_config
-torch.set_grad_enabled(False)
-def put_watermark(img, wm_encoder=None):
-    if wm_encoder is not None:
-        img = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
-        img = wm_encoder.encode(img, 'dwtDct')
-        img = Image.fromarray(img[:, :, ::-1])
-    return img
-def initialize_model(config, ckpt):
-    config = OmegaConf.load(config)
-    model = instantiate_from_config(config.model)
-    model.load_state_dict(torch.load(ckpt)["state_dict"], strict=False)
-    device = torch.device(
-        "cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = model.to(device)
-    sampler = DDIMSampler(model)
-    return sampler
-def make_batch_sd(
-        image,
-        mask,
-        txt,
-        device,
-        num_samples=1):
-    image = np.array(image.convert("RGB"))
-    image = image[None].transpose(0, 3, 1, 2)
-    image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0
-    mask = np.array(mask.convert("L"))
-    mask = mask.astype(np.float32) / 255.0
-    mask = mask[None, None]
-    mask[mask < 0.5] = 0
-    mask[mask >= 0.5] = 1
-    mask = torch.from_numpy(mask)
-    masked_image = image * (mask < 0.5)
-    batch = {
-        "image": repeat(image.to(device=device), "1 ... -> n ...", n=num_samples),
-        "txt": num_samples * [txt],
-        "mask": repeat(mask.to(device=device), "1 ... -> n ...", n=num_samples),
-        "masked_image": repeat(masked_image.to(device=device), "1 ... -> n ...", n=num_samples),
-    }
-    return batch
-@torch.no_grad()
-def inpaint(sampler, image, mask, prompt, seed, scale, ddim_steps, num_samples=1, w=512, h=512):
-    device = torch.device(
-        "cuda") if torch.cuda.is_available() else torch.device("cpu")
-    model = sampler.model
-    print("Creating invisible watermark encoder (see https://github.com/ShieldMnt/invisible-watermark)...")
-    wm = "SDV2"
-    wm_encoder = WatermarkEncoder()
-    wm_encoder.set_watermark('bytes', wm.encode('utf-8'))
-    prng = np.random.RandomState(seed)
-    start_code = prng.randn(num_samples, 4, h // 8, w // 8)
-    start_code = torch.from_numpy(start_code).to(
-        device=device, dtype=torch.float32)
-    with torch.no_grad(), \
-            torch.autocast("cuda"):
-        batch = make_batch_sd(image, mask, txt=prompt,
-                              device=device, num_samples=num_samples)
-        c = model.cond_stage_model.encode(batch["txt"])
-        c_cat = list()
-        for ck in model.concat_keys:
-            cc = batch[ck].float()
-            if ck != model.masked_image_key:
-                bchw = [num_samples, 4, h // 8, w // 8]
-                cc = torch.nn.functional.interpolate(cc, size=bchw[-2:])
-            else:
-                cc = model.get_first_stage_encoding(
-                    model.encode_first_stage(cc))
-            c_cat.append(cc)
-        c_cat = torch.cat(c_cat, dim=1)
-        # cond
-        cond = {"c_concat": [c_cat], "c_crossattn": [c]}
-        # uncond cond
-        uc_cross = model.get_unconditional_conditioning(num_samples, "")
-        uc_full = {"c_concat": [c_cat], "c_crossattn": [uc_cross]}
-        shape = [model.channels, h // 8, w // 8]
-        samples_cfg, intermediates = sampler.sample(
-            ddim_steps,
-            num_samples,
-            shape,
-            cond,
-            verbose=False,
-            eta=1.0,
-            unconditional_guidance_scale=scale,
-            unconditional_conditioning=uc_full,
-            x_T=start_code,
-        )
-        x_samples_ddim = model.decode_first_stage(samples_cfg)
-        result = torch.clamp((x_samples_ddim + 1.0) / 2.0,
-                             min=0.0, max=1.0)
-        result = result.cpu().numpy().transpose(0, 2, 3, 1) * 255
-    return [put_watermark(Image.fromarray(img.astype(np.uint8)), wm_encoder) for img in result]
-def pad_image(input_image):
-    pad_w, pad_h = np.max(((2, 2), np.ceil(
-        np.array(input_image.size) / 64).astype(int)), axis=0) * 64 - input_image.size
-    im_padded = Image.fromarray(
-        np.pad(np.array(input_image), ((0, pad_h), (0, pad_w), (0, 0)), mode='edge'))
-    return im_padded
-def crop_image(input_image):
-    crop_w, crop_h = np.floor(np.array(input_image.size) / 64).astype(int) * 64
-    im_cropped = Image.fromarray(np.array(input_image)[:crop_h, :crop_w])
-    return im_cropped
-# sampler = initialize_model(sys.argv[1], sys.argv[2])
-@torch.no_grad()
-def predict(model, input_image, prompt, ddim_steps, num_samples, scale, seed):
-    """_summary_
-    Args:
-        input_image (_type_): dict
-            - image: PIL.Image. Input image.
-            - mask: PIL.Image. Mask image.
-        prompt (_type_): string to be used as prompt.
-        ddim_steps (_type_): typical 45
-        num_samples (_type_): typical 4
-        scale (_type_): typical 10.0 Guidance Scale.
-        seed (_type_): typical 1529160519
-    """
-    init_image = input_image["image"].convert("RGB")
-    init_mask = input_image["mask"].convert("RGB")
-    image = pad_image(init_image) # resize to integer multiple of 32
-    mask = pad_image(init_mask) # resize to integer multiple of 32
-    width, height = image.size
-    print("Inpainting...", width, height)
-    result = inpaint(
-        sampler=model,
-        image=image,
-        mask=mask,
-        prompt=prompt,
-        seed=seed,
-        scale=scale,
-        ddim_steps=ddim_steps,
-        num_samples=num_samples,
-        h=height, w=width
-    )
-    return result

utils/misc.py DELETED Viewed

@@ -1,122 +0,0 @@
-import math
-import numpy as np
-def get_prompt_templates():
-    prompt_templates = [
-        '{}.',
-        'a photo of a {}.',
-        'a bad photo of a {}.',
-        'a photo of many {}.',
-        'a sculpture of a {}.',
-        'a photo of the hard to see {}.',
-        'a low resolution photo of the {}.',
-        'a rendering of a {}.',
-        'graffiti of a {}.',
-        'a bad photo of the {}.',
-        'a cropped photo of the {}.',
-        'a tattoo of a {}.',
-        'the embroidered {}.',
-        'a photo of a hard to see {}.',
-        'a bright photo of a {}.',
-        'a photo of a clean {}.',
-        'a photo of a dirty {}.',
-        'a dark photo of the {}.',
-        'a drawing of a {}.',
-        'a photo of my {}.',
-        'the plastic {}.',
-        'a photo of the cool {}.',
-        'a close-up photo of a {}.',
-        'a black and white photo of the {}.',
-        'a painting of the {}.',
-        'a painting of a {}.',
-        'a pixelated photo of the {}.',
-        'a sculpture of the {}.',
-        'a bright photo of the {}.',
-        'a cropped photo of a {}.',
-        'a plastic {}.',
-        'a photo of the dirty {}.',
-        'a jpeg corrupted photo of a {}.',
-        'a blurry photo of the {}.',
-        'a photo of the {}.',
-        'a good photo of the {}.',
-        'a rendering of the {}.',
-        'a {} in a video game.',
-        'a photo of one {}.',
-        'a doodle of a {}.',
-        'a close-up photo of the {}.',
-        'the origami {}.',
-        'the {} in a video game.',
-        'a sketch of a {}.',
-        'a doodle of the {}.',
-        'a origami {}.',
-        'a low resolution photo of a {}.',
-        'the toy {}.',
-        'a rendition of the {}.',
-        'a photo of the clean {}.',
-        'a photo of a large {}.',
-        'a rendition of a {}.',
-        'a photo of a nice {}.',
-        'a photo of a weird {}.',
-        'a blurry photo of a {}.',
-        'a cartoon {}.',
-        'art of a {}.',
-        'a sketch of the {}.',
-        'a embroidered {}.',
-        'a pixelated photo of a {}.',
-        'itap of the {}.',
-        'a jpeg corrupted photo of the {}.',
-        'a good photo of a {}.',
-        'a plushie {}.',
-        'a photo of the nice {}.',
-        'a photo of the small {}.',
-        'a photo of the weird {}.',
-        'the cartoon {}.',
-        'art of the {}.',
-        'a drawing of the {}.',
-        'a photo of the large {}.',
-        'a black and white photo of a {}.',
-        'the plushie {}.',
-        'a dark photo of a {}.',
-        'itap of a {}.',
-        'graffiti of the {}.',
-        'a toy {}.',
-        'itap of my {}.',
-        'a photo of a cool {}.',
-        'a photo of a small {}.',
-        'a tattoo of the {}.',
-    ]
-    return prompt_templates
-def prompt_engineering(classnames, topk=1, suffix='.'):
-    prompt_templates = get_prompt_templates()
-    temp_idx = np.random.randint(min(len(prompt_templates), topk))
-    if isinstance(classnames, list):
-        classname = random.choice(classnames)
-    else:
-        classname = classnames
-    return prompt_templates[temp_idx].replace('.', suffix).format(classname.replace(',', '').replace('+', ' '))
-class AverageMeter(object):
-    """Computes and stores the average and current value."""
-    def __init__(self):
-        self.reset()
-    def reset(self):
-        self.val = 0
-        self.avg = 0
-        self.sum = 0
-        self.count = 0
-    def update(self, val, n=1, decay=0):
-        self.val = val
-        if decay:
-            alpha = math.exp(-n / decay)  # exponential decay over 100 updates
-            self.sum = alpha * self.sum + (1 - alpha) * val * n
-            self.count = alpha * self.count + (1 - alpha) * n
-        else:
-            self.sum += val * n
-            self.count += n
-        self.avg = self.sum / self.count

utils/model.py DELETED Viewed

@@ -1,32 +0,0 @@
-import logging
-import os
-import time
-import pickle
-import torch
-import torch.distributed as dist
-from fvcore.nn import FlopCountAnalysis
-from fvcore.nn import flop_count_table
-from fvcore.nn import flop_count_str
-logger = logging.getLogger(__name__)
-NORM_MODULES = [
-    torch.nn.BatchNorm1d,
-    torch.nn.BatchNorm2d,
-    torch.nn.BatchNorm3d,
-    torch.nn.SyncBatchNorm,
-    # NaiveSyncBatchNorm inherits from BatchNorm2d
-    torch.nn.GroupNorm,
-    torch.nn.InstanceNorm1d,
-    torch.nn.InstanceNorm2d,
-    torch.nn.InstanceNorm3d,
-    torch.nn.LayerNorm,
-    torch.nn.LocalResponseNorm,
-]
-def register_norm_module(cls):
-    NORM_MODULES.append(cls)
-    return cls

utils/model_loading.py DELETED Viewed

@@ -1,42 +0,0 @@
-# --------------------------------------------------------
-# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
-# Copyright (c) 2022 Microsoft
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Xueyan Zou (xueyan@cs.wisc.edu)
-# --------------------------------------------------------
-import logging
-from utils.distributed import is_main_process
-logger = logging.getLogger(__name__)
-def align_and_update_state_dicts(model_state_dict, ckpt_state_dict):
-    model_keys = sorted(model_state_dict.keys())
-    ckpt_keys = sorted(ckpt_state_dict.keys())
-    result_dicts = {}
-    matched_log = []
-    unmatched_log = []
-    unloaded_log = []
-    for model_key in model_keys:
-        model_weight = model_state_dict[model_key]
-        if model_key in ckpt_keys:
-            ckpt_weight = ckpt_state_dict[model_key]
-            if model_weight.shape == ckpt_weight.shape:
-                result_dicts[model_key] = ckpt_weight
-                ckpt_keys.pop(ckpt_keys.index(model_key))
-                matched_log.append("Loaded {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape))
-            else:
-                unmatched_log.append("*UNMATCHED* {}, Model Shape: {} <-> Ckpt Shape: {}".format(model_key, model_weight.shape, ckpt_weight.shape))
-        else:
-            unloaded_log.append("*UNLOADED* {}, Model Shape: {}".format(model_key, model_weight.shape))
-    if is_main_process():
-        for info in matched_log:
-            logger.info(info)
-        for info in unloaded_log:
-            logger.warning(info)
-        for key in ckpt_keys:
-            logger.warning("$UNUSED$ {}, Ckpt Shape: {}".format(key, ckpt_state_dict[key].shape))
-        for info in unmatched_log:
-            logger.warning(info)
-    return result_dicts

utils/util.py DELETED Viewed

@@ -1,283 +0,0 @@
-# adopted from
-# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py
-# and
-# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py
-# and
-# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py
-#
-# thanks!
-import importlib
-import os
-import math
-import torch
-import torch.nn as nn
-import numpy as np
-from einops import repeat
-def instantiate_from_config(config):
-    if not "target" in config:
-        if config == '__is_first_stage__':
-            return None
-        elif config == "__is_unconditional__":
-            return None
-        raise KeyError("Expected key `target` to instantiate.")
-    return get_obj_from_str(config["target"])(**config.get("params", dict()))
-def get_obj_from_str(string, reload=False):
-    module, cls = string.rsplit(".", 1)
-    if reload:
-        module_imp = importlib.import_module(module)
-        importlib.reload(module_imp)
-    return getattr(importlib.import_module(module, package=None), cls)
-def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3):
-    if schedule == "linear":
-        betas = (
-                torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2
-        )
-    elif schedule == "cosine":
-        timesteps = (
-                torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s
-        )
-        alphas = timesteps / (1 + cosine_s) * np.pi / 2
-        alphas = torch.cos(alphas).pow(2)
-        alphas = alphas / alphas[0]
-        betas = 1 - alphas[1:] / alphas[:-1]
-        betas = np.clip(betas, a_min=0, a_max=0.999)
-    elif schedule == "sqrt_linear":
-        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64)
-    elif schedule == "sqrt":
-        betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5
-    else:
-        raise ValueError(f"schedule '{schedule}' unknown.")
-    return betas.numpy()
-def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True):
-    if ddim_discr_method == 'uniform':
-        c = num_ddpm_timesteps // num_ddim_timesteps
-        ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c)))
-    elif ddim_discr_method == 'quad':
-        ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int)
-    else:
-        raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"')
-    # assert ddim_timesteps.shape[0] == num_ddim_timesteps
-    # add one to get the final alpha values right (the ones from first scale to data during sampling)
-    steps_out = ddim_timesteps + 1
-    if verbose:
-        print(f'Selected timesteps for ddim sampler: {steps_out}')
-    return steps_out
-def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True):
-    # select alphas for computing the variance schedule
-    alphas = alphacums[ddim_timesteps]
-    alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist())
-    # according the the formula provided in https://arxiv.org/abs/2010.02502
-    sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev))
-    if verbose:
-        print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}')
-        print(f'For the chosen value of eta, which is {eta}, '
-              f'this results in the following sigma_t schedule for ddim sampler {sigmas}')
-    return sigmas, alphas, alphas_prev
-def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999):
-    """
-    Create a beta schedule that discretizes the given alpha_t_bar function,
-    which defines the cumulative product of (1-beta) over time from t = [0,1].
-    :param num_diffusion_timesteps: the number of betas to produce.
-    :param alpha_bar: a lambda that takes an argument t from 0 to 1 and
-                      produces the cumulative product of (1-beta) up to that
-                      part of the diffusion process.
-    :param max_beta: the maximum beta to use; use values lower than 1 to
-                     prevent singularities.
-    """
-    betas = []
-    for i in range(num_diffusion_timesteps):
-        t1 = i / num_diffusion_timesteps
-        t2 = (i + 1) / num_diffusion_timesteps
-        betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta))
-    return np.array(betas)
-def extract_into_tensor(a, t, x_shape):
-    b, *_ = t.shape
-    out = a.gather(-1, t)
-    return out.reshape(b, *((1,) * (len(x_shape) - 1)))
-def checkpoint(func, inputs, params, flag):
-    """
-    Evaluate a function without caching intermediate activations, allowing for
-    reduced memory at the expense of extra compute in the backward pass.
-    :param func: the function to evaluate.
-    :param inputs: the argument sequence to pass to `func`.
-    :param params: a sequence of parameters `func` depends on but does not
-                   explicitly take as arguments.
-    :param flag: if False, disable gradient checkpointing.
-    """
-    if flag:
-        args = tuple(inputs) + tuple(params)
-        return CheckpointFunction.apply(func, len(inputs), *args)
-    else:
-        return func(*inputs)
-class CheckpointFunction(torch.autograd.Function):
-    @staticmethod
-    def forward(ctx, run_function, length, *args):
-        ctx.run_function = run_function
-        ctx.input_tensors = list(args[:length])
-        ctx.input_params = list(args[length:])
-        with torch.no_grad():
-            output_tensors = ctx.run_function(*ctx.input_tensors)
-        return output_tensors
-    @staticmethod
-    def backward(ctx, *output_grads):
-        ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors]
-        with torch.enable_grad():
-            # Fixes a bug where the first op in run_function modifies the
-            # Tensor storage in place, which is not allowed for detach()'d
-            # Tensors.
-            shallow_copies = [x.view_as(x) for x in ctx.input_tensors]
-            output_tensors = ctx.run_function(*shallow_copies)
-        input_grads = torch.autograd.grad(
-            output_tensors,
-            ctx.input_tensors + ctx.input_params,
-            output_grads,
-            allow_unused=True,
-        )
-        del ctx.input_tensors
-        del ctx.input_params
-        del output_tensors
-        return (None, None) + input_grads
-def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False):
-    """
-    Create sinusoidal timestep embeddings.
-    :param timesteps: a 1-D Tensor of N indices, one per batch element.
-                      These may be fractional.
-    :param dim: the dimension of the output.
-    :param max_period: controls the minimum frequency of the embeddings.
-    :return: an [N x dim] Tensor of positional embeddings.
-    """
-    if not repeat_only:
-        half = dim // 2
-        freqs = torch.exp(
-            -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half
-        ).to(device=timesteps.device)
-        args = timesteps[:, None].float() * freqs[None]
-        embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
-        if dim % 2:
-            embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
-    else:
-        embedding = repeat(timesteps, 'b -> b d', d=dim)
-    return embedding
-def zero_module(module):
-    """
-    Zero out the parameters of a module and return it.
-    """
-    for p in module.parameters():
-        p.detach().zero_()
-    return module
-def scale_module(module, scale):
-    """
-    Scale the parameters of a module and return it.
-    """
-    for p in module.parameters():
-        p.detach().mul_(scale)
-    return module
-def mean_flat(tensor):
-    """
-    Take the mean over all non-batch dimensions.
-    """
-    return tensor.mean(dim=list(range(1, len(tensor.shape))))
-def normalization(channels):
-    """
-    Make a standard normalization layer.
-    :param channels: number of input channels.
-    :return: an nn.Module for normalization.
-    """
-    return GroupNorm32(32, channels)
-# PyTorch 1.7 has SiLU, but we support PyTorch 1.5.
-class SiLU(nn.Module):
-    def forward(self, x):
-        return x * torch.sigmoid(x)
-class GroupNorm32(nn.GroupNorm):
-    def forward(self, x):
-        return super().forward(x.float()).type(x.dtype)
-def conv_nd(dims, *args, **kwargs):
-    """
-    Create a 1D, 2D, or 3D convolution module.
-    """
-    if dims == 1:
-        return nn.Conv1d(*args, **kwargs)
-    elif dims == 2:
-        return nn.Conv2d(*args, **kwargs)
-    elif dims == 3:
-        return nn.Conv3d(*args, **kwargs)
-    raise ValueError(f"unsupported dimensions: {dims}")
-def linear(*args, **kwargs):
-    """
-    Create a linear module.
-    """
-    return nn.Linear(*args, **kwargs)
-def avg_pool_nd(dims, *args, **kwargs):
-    """
-    Create a 1D, 2D, or 3D average pooling module.
-    """
-    if dims == 1:
-        return nn.AvgPool1d(*args, **kwargs)
-    elif dims == 2:
-        return nn.AvgPool2d(*args, **kwargs)
-    elif dims == 3:
-        return nn.AvgPool3d(*args, **kwargs)
-    raise ValueError(f"unsupported dimensions: {dims}")
-class HybridConditioner(nn.Module):
-    def __init__(self, c_concat_config, c_crossattn_config):
-        super().__init__()
-        self.concat_conditioner = instantiate_from_config(c_concat_config)
-        self.crossattn_conditioner = instantiate_from_config(c_crossattn_config)
-    def forward(self, c_concat, c_crossattn):
-        c_concat = self.concat_conditioner(c_concat)
-        c_crossattn = self.crossattn_conditioner(c_crossattn)
-        return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]}
-def noise_like(shape, device, repeat=False):
-    repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1)))
-    noise = lambda: torch.randn(shape, device=device)
-    return repeat_noise() if repeat else noise()

utils/visualizer.py DELETED Viewed

@@ -1,1278 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-import colorsys
-import logging
-import math
-import numpy as np
-from enum import Enum, unique
-import cv2
-import matplotlib as mpl
-import matplotlib.colors as mplc
-import matplotlib.figure as mplfigure
-import pycocotools.mask as mask_util
-import torch
-from matplotlib.backends.backend_agg import FigureCanvasAgg
-from PIL import Image
-from detectron2.data import MetadataCatalog
-from detectron2.structures import BitMasks, Boxes, BoxMode, Keypoints, PolygonMasks, RotatedBoxes
-from detectron2.utils.file_io import PathManager
-from detectron2.utils.colormap import random_color
-logger = logging.getLogger(__name__)
-__all__ = ["ColorMode", "VisImage", "Visualizer"]
-_SMALL_OBJECT_AREA_THRESH = 1000
-_LARGE_MASK_AREA_THRESH = 120000
-_OFF_WHITE = (1.0, 1.0, 240.0 / 255)
-_BLACK = (0, 0, 0)
-_RED = (1.0, 0, 0)
-_KEYPOINT_THRESHOLD = 0.05
-@unique
-class ColorMode(Enum):
-    """
-    Enum of different color modes to use for instance visualizations.
-    """
-    IMAGE = 0
-    """
-    Picks a random color for every instance and overlay segmentations with low opacity.
-    """
-    SEGMENTATION = 1
-    """
-    Let instances of the same category have similar colors
-    (from metadata.thing_colors), and overlay them with
-    high opacity. This provides more attention on the quality of segmentation.
-    """
-    IMAGE_BW = 2
-    """
-    Same as IMAGE, but convert all areas without masks to gray-scale.
-    Only available for drawing per-instance mask predictions.
-    """
-class GenericMask:
-    """
-    Attribute:
-        polygons (list[ndarray]): list[ndarray]: polygons for this mask.
-            Each ndarray has format [x, y, x, y, ...]
-        mask (ndarray): a binary mask
-    """
-    def __init__(self, mask_or_polygons, height, width):
-        self._mask = self._polygons = self._has_holes = None
-        self.height = height
-        self.width = width
-        m = mask_or_polygons
-        if isinstance(m, dict):
-            # RLEs
-            assert "counts" in m and "size" in m
-            if isinstance(m["counts"], list):  # uncompressed RLEs
-                h, w = m["size"]
-                assert h == height and w == width
-                m = mask_util.frPyObjects(m, h, w)
-            self._mask = mask_util.decode(m)[:, :]
-            return
-        if isinstance(m, list):  # list[ndarray]
-            self._polygons = [np.asarray(x).reshape(-1) for x in m]
-            return
-        if isinstance(m, np.ndarray):  # assumed to be a binary mask
-            assert m.shape[1] != 2, m.shape
-            assert m.shape == (
-                height,
-                width,
-            ), f"mask shape: {m.shape}, target dims: {height}, {width}"
-            self._mask = m.astype("uint8")
-            return
-        raise ValueError("GenericMask cannot handle object {} of type '{}'".format(m, type(m)))
-    @property
-    def mask(self):
-        if self._mask is None:
-            self._mask = self.polygons_to_mask(self._polygons)
-        return self._mask
-    @property
-    def polygons(self):
-        if self._polygons is None:
-            self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
-        return self._polygons
-    @property
-    def has_holes(self):
-        if self._has_holes is None:
-            if self._mask is not None:
-                self._polygons, self._has_holes = self.mask_to_polygons(self._mask)
-            else:
-                self._has_holes = False  # if original format is polygon, does not have holes
-        return self._has_holes
-    def mask_to_polygons(self, mask):
-        # cv2.RETR_CCOMP flag retrieves all the contours and arranges them to a 2-level
-        # hierarchy. External contours (boundary) of the object are placed in hierarchy-1.
-        # Internal contours (holes) are placed in hierarchy-2.
-        # cv2.CHAIN_APPROX_NONE flag gets vertices of polygons from contours.
-        mask = np.ascontiguousarray(mask)  # some versions of cv2 does not support incontiguous arr
-        res = cv2.findContours(mask.astype("uint8"), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_NONE)
-        hierarchy = res[-1]
-        if hierarchy is None:  # empty mask
-            return [], False
-        has_holes = (hierarchy.reshape(-1, 4)[:, 3] >= 0).sum() > 0
-        res = res[-2]
-        res = [x.flatten() for x in res]
-        # These coordinates from OpenCV are integers in range [0, W-1 or H-1].
-        # We add 0.5 to turn them into real-value coordinate space. A better solution
-        # would be to first +0.5 and then dilate the returned polygon by 0.5.
-        res = [x + 0.5 for x in res if len(x) >= 6]
-        return res, has_holes
-    def polygons_to_mask(self, polygons):
-        rle = mask_util.frPyObjects(polygons, self.height, self.width)
-        rle = mask_util.merge(rle)
-        return mask_util.decode(rle)[:, :]
-    def area(self):
-        return self.mask.sum()
-    def bbox(self):
-        p = mask_util.frPyObjects(self.polygons, self.height, self.width)
-        p = mask_util.merge(p)
-        bbox = mask_util.toBbox(p)
-        bbox[2] += bbox[0]
-        bbox[3] += bbox[1]
-        return bbox
-class _PanopticPrediction:
-    """
-    Unify different panoptic annotation/prediction formats
-    """
-    def __init__(self, panoptic_seg, segments_info, metadata=None):
-        if segments_info is None:
-            assert metadata is not None
-            # If "segments_info" is None, we assume "panoptic_img" is a
-            # H*W int32 image storing the panoptic_id in the format of
-            # category_id * label_divisor + instance_id. We reserve -1 for
-            # VOID label.
-            label_divisor = metadata.label_divisor
-            segments_info = []
-            for panoptic_label in np.unique(panoptic_seg.numpy()):
-                if panoptic_label == -1:
-                    # VOID region.
-                    continue
-                pred_class = panoptic_label // label_divisor
-                isthing = pred_class in metadata.thing_dataset_id_to_contiguous_id.values()
-                segments_info.append(
-                    {
-                        "id": int(panoptic_label),
-                        "category_id": int(pred_class),
-                        "isthing": bool(isthing),
-                    }
-                )
-        del metadata
-        self._seg = panoptic_seg
-        self._sinfo = {s["id"]: s for s in segments_info}  # seg id -> seg info
-        segment_ids, areas = torch.unique(panoptic_seg, sorted=True, return_counts=True)
-        areas = areas.numpy()
-        sorted_idxs = np.argsort(-areas)
-        self._seg_ids, self._seg_areas = segment_ids[sorted_idxs], areas[sorted_idxs]
-        self._seg_ids = self._seg_ids.tolist()
-        for sid, area in zip(self._seg_ids, self._seg_areas):
-            if sid in self._sinfo:
-                self._sinfo[sid]["area"] = float(area)
-    def non_empty_mask(self):
-        """
-        Returns:
-            (H, W) array, a mask for all pixels that have a prediction
-        """
-        empty_ids = []
-        for id in self._seg_ids:
-            if id not in self._sinfo:
-                empty_ids.append(id)
-        if len(empty_ids) == 0:
-            return np.zeros(self._seg.shape, dtype=np.uint8)
-        assert (
-            len(empty_ids) == 1
-        ), ">1 ids corresponds to no labels. This is currently not supported"
-        return (self._seg != empty_ids[0]).numpy().astype(np.bool)
-    def semantic_masks(self):
-        for sid in self._seg_ids:
-            sinfo = self._sinfo.get(sid)
-            if sinfo is None or sinfo["isthing"]:
-                # Some pixels (e.g. id 0 in PanopticFPN) have no instance or semantic predictions.
-                continue
-            yield (self._seg == sid).numpy().astype(np.bool), sinfo
-    def instance_masks(self):
-        for sid in self._seg_ids:
-            sinfo = self._sinfo.get(sid)
-            if sinfo is None or not sinfo["isthing"]:
-                continue
-            mask = (self._seg == sid).numpy().astype(np.bool)
-            if mask.sum() > 0:
-                yield mask, sinfo
-def _create_text_labels(classes, scores, class_names, is_crowd=None):
-    """
-    Args:
-        classes (list[int] or None):
-        scores (list[float] or None):
-        class_names (list[str] or None):
-        is_crowd (list[bool] or None):
-    Returns:
-        list[str] or None
-    """
-    labels = None
-    if classes is not None:
-        if class_names is not None and len(class_names) > 0:
-            labels = [class_names[i] for i in classes]
-        else:
-            labels = [str(i) for i in classes]
-    if scores is not None:
-        if labels is None:
-            labels = ["{:.0f}%".format(s * 100) for s in scores]
-        else:
-            labels = ["{} {:.0f}%".format(l, s * 100) for l, s in zip(labels, scores)]
-    if labels is not None and is_crowd is not None:
-        labels = [l + ("|crowd" if crowd else "") for l, crowd in zip(labels, is_crowd)]
-    return labels
-class VisImage:
-    def __init__(self, img, scale=1.0):
-        """
-        Args:
-            img (ndarray): an RGB image of shape (H, W, 3) in range [0, 255].
-            scale (float): scale the input image
-        """
-        self.img = img
-        self.scale = scale
-        self.width, self.height = img.shape[1], img.shape[0]
-        self._setup_figure(img)
-    def _setup_figure(self, img):
-        """
-        Args:
-            Same as in :meth:`__init__()`.
-        Returns:
-            fig (matplotlib.pyplot.figure): top level container for all the image plot elements.
-            ax (matplotlib.pyplot.Axes): contains figure elements and sets the coordinate system.
-        """
-        fig = mplfigure.Figure(frameon=False)
-        self.dpi = fig.get_dpi()
-        # add a small 1e-2 to avoid precision lost due to matplotlib's truncation
-        # (https://github.com/matplotlib/matplotlib/issues/15363)
-        fig.set_size_inches(
-            (self.width * self.scale + 1e-2) / self.dpi,
-            (self.height * self.scale + 1e-2) / self.dpi,
-        )
-        self.canvas = FigureCanvasAgg(fig)
-        # self.canvas = mpl.backends.backend_cairo.FigureCanvasCairo(fig)
-        ax = fig.add_axes([0.0, 0.0, 1.0, 1.0])
-        ax.axis("off")
-        self.fig = fig
-        self.ax = ax
-        self.reset_image(img)
-    def reset_image(self, img):
-        """
-        Args:
-            img: same as in __init__
-        """
-        img = img.astype("uint8")
-        self.ax.imshow(img, extent=(0, self.width, self.height, 0), interpolation="nearest")
-    def save(self, filepath):
-        """
-        Args:
-            filepath (str): a string that contains the absolute path, including the file name, where
-                the visualized image will be saved.
-        """
-        self.fig.savefig(filepath)
-    def get_image(self):
-        """
-        Returns:
-            ndarray:
-                the visualized image of shape (H, W, 3) (RGB) in uint8 type.
-                The shape is scaled w.r.t the input image using the given `scale` argument.
-        """
-        canvas = self.canvas
-        s, (width, height) = canvas.print_to_buffer()
-        # buf = io.BytesIO()  # works for cairo backend
-        # canvas.print_rgba(buf)
-        # width, height = self.width, self.height
-        # s = buf.getvalue()
-        buffer = np.frombuffer(s, dtype="uint8")
-        img_rgba = buffer.reshape(height, width, 4)
-        rgb, alpha = np.split(img_rgba, [3], axis=2)
-        return rgb.astype("uint8")
-class Visualizer:
-    """
-    Visualizer that draws data about detection/segmentation on images.
-    It contains methods like `draw_{text,box,circle,line,binary_mask,polygon}`
-    that draw primitive objects to images, as well as high-level wrappers like
-    `draw_{instance_predictions,sem_seg,panoptic_seg_predictions,dataset_dict}`
-    that draw composite data in some pre-defined style.
-    Note that the exact visualization style for the high-level wrappers are subject to change.
-    Style such as color, opacity, label contents, visibility of labels, or even the visibility
-    of objects themselves (e.g. when the object is too small) may change according
-    to different heuristics, as long as the results still look visually reasonable.
-    To obtain a consistent style, you can implement custom drawing functions with the
-    abovementioned primitive methods instead. If you need more customized visualization
-    styles, you can process the data yourself following their format documented in
-    tutorials (:doc:`/tutorials/models`, :doc:`/tutorials/datasets`). This class does not
-    intend to satisfy everyone's preference on drawing styles.
-    This visualizer focuses on high rendering quality rather than performance. It is not
-    designed to be used for real-time applications.
-    """
-    # TODO implement a fast, rasterized version using OpenCV
-    def __init__(self, img_rgb, metadata=None, scale=1.0, instance_mode=ColorMode.IMAGE):
-        """
-        Args:
-            img_rgb: a numpy array of shape (H, W, C), where H and W correspond to
-                the height and width of the image respectively. C is the number of
-                color channels. The image is required to be in RGB format since that
-                is a requirement of the Matplotlib library. The image is also expected
-                to be in the range [0, 255].
-            metadata (Metadata): dataset metadata (e.g. class names and colors)
-            instance_mode (ColorMode): defines one of the pre-defined style for drawing
-                instances on an image.
-        """
-        self.img = np.asarray(img_rgb).clip(0, 255).astype(np.uint8)
-        if metadata is None:
-            metadata = MetadataCatalog.get("__nonexist__")
-        self.metadata = metadata
-        self.output = VisImage(self.img, scale=scale)
-        self.cpu_device = torch.device("cpu")
-        # too small texts are useless, therefore clamp to 9
-        self._default_font_size = max(
-            np.sqrt(self.output.height * self.output.width) // 90, 10 // scale
-        )
-        self._default_font_size = 18
-        self._instance_mode = instance_mode
-        self.keypoint_threshold = _KEYPOINT_THRESHOLD
-    def draw_instance_predictions(self, predictions):
-        """
-        Draw instance-level prediction results on an image.
-        Args:
-            predictions (Instances): the output of an instance detection/segmentation
-                model. Following fields will be used to draw:
-                "pred_boxes", "pred_classes", "scores", "pred_masks" (or "pred_masks_rle").
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        boxes = predictions.pred_boxes if predictions.has("pred_boxes") else None
-        scores = predictions.scores if predictions.has("scores") else None
-        classes = predictions.pred_classes.tolist() if predictions.has("pred_classes") else None
-        labels = _create_text_labels(classes, scores, self.metadata.get("thing_classes", None))
-        keypoints = predictions.pred_keypoints if predictions.has("pred_keypoints") else None
-        keep = (scores > 0.8).cpu()
-        boxes = boxes[keep]
-        scores = scores[keep]
-        classes = np.array(classes)
-        classes = classes[np.array(keep)]
-        labels = np.array(labels)
-        labels = labels[np.array(keep)]
-        if predictions.has("pred_masks"):
-            masks = np.asarray(predictions.pred_masks)
-            masks = masks[np.array(keep)]
-            masks = [GenericMask(x, self.output.height, self.output.width) for x in masks]
-        else:
-            masks = None
-        if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
-        # if self.metadata.get("thing_colors"):
-            colors = [
-                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in classes
-            ]
-            alpha = 0.4
-        else:
-            colors = None
-            alpha = 0.4
-        if self._instance_mode == ColorMode.IMAGE_BW:
-            self.output.reset_image(
-                self._create_grayscale_image(
-                    (predictions.pred_masks.any(dim=0) > 0).numpy()
-                    if predictions.has("pred_masks")
-                    else None
-                )
-            )
-            alpha = 0.3
-        self.overlay_instances(
-            masks=masks,
-            boxes=boxes,
-            labels=labels,
-            keypoints=keypoints,
-            assigned_colors=colors,
-            alpha=alpha,
-        )
-        return self.output
-    def draw_sem_seg(self, sem_seg, area_threshold=None, alpha=0.7):
-        """
-        Draw semantic segmentation predictions/labels.
-        Args:
-            sem_seg (Tensor or ndarray): the segmentation of shape (H, W).
-                Each value is the integer label of the pixel.
-            area_threshold (int): segments with less than `area_threshold` are not drawn.
-            alpha (float): the larger it is, the more opaque the segmentations are.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        if isinstance(sem_seg, torch.Tensor):
-            sem_seg = sem_seg.numpy()
-        labels, areas = np.unique(sem_seg, return_counts=True)
-        sorted_idxs = np.argsort(-areas).tolist()
-        labels = labels[sorted_idxs]
-        for label in filter(lambda l: l < len(self.metadata.stuff_classes), labels):
-            try:
-                mask_color = [x / 255 for x in self.metadata.stuff_colors[label]]
-            except (AttributeError, IndexError):
-                mask_color = None
-            binary_mask = (sem_seg == label).astype(np.uint8)
-            text = self.metadata.stuff_classes[label]
-            self.draw_binary_mask(
-                binary_mask,
-                color=mask_color,
-                edge_color=_OFF_WHITE,
-                text=text,
-                alpha=alpha,
-                area_threshold=area_threshold,
-            )
-        return self.output
-    def draw_panoptic_seg(self, panoptic_seg, segments_info, area_threshold=None, alpha=0.7):
-        """
-        Draw panoptic prediction annotations or results.
-        Args:
-            panoptic_seg (Tensor): of shape (height, width) where the values are ids for each
-                segment.
-            segments_info (list[dict] or None): Describe each segment in `panoptic_seg`.
-                If it is a ``list[dict]``, each dict contains keys "id", "category_id".
-                If None, category id of each pixel is computed by
-                ``pixel // metadata.label_divisor``.
-            area_threshold (int): stuff segments with less than `area_threshold` are not drawn.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        pred = _PanopticPrediction(panoptic_seg, segments_info, self.metadata)
-        if self._instance_mode == ColorMode.IMAGE_BW:
-            self.output.reset_image(self._create_grayscale_image(pred.non_empty_mask()))
-        # draw mask for all semantic segments first i.e. "stuff"
-        for mask, sinfo in pred.semantic_masks():
-            category_idx = sinfo["category_id"]
-            try:
-                mask_color = [x / 255 for x in self.metadata.stuff_colors[category_idx]]
-            except AttributeError:
-                mask_color = None
-            text = self.metadata.stuff_classes[category_idx]
-            self.draw_binary_mask(
-                mask,
-                color=mask_color,
-                edge_color=_OFF_WHITE,
-                text=text,
-                alpha=alpha,
-                area_threshold=area_threshold,
-            )
-        # draw mask for all instances second
-        all_instances = list(pred.instance_masks())
-        if len(all_instances) == 0:
-            return self.output
-        masks, sinfo = list(zip(*all_instances))
-        category_ids = [x["category_id"] for x in sinfo]
-        try:
-            scores = [x["score"] for x in sinfo]
-        except KeyError:
-            scores = None
-        labels = _create_text_labels(
-            category_ids, scores, self.metadata.thing_classes, [x.get("iscrowd", 0) for x in sinfo]
-        )
-        try:
-            colors = [
-                self._jitter([x / 255 for x in self.metadata.thing_colors[c]]) for c in category_ids
-            ]
-        except AttributeError:
-            colors = None
-        self.overlay_instances(masks=masks, labels=labels, assigned_colors=colors, alpha=alpha)
-        return self.output
-    draw_panoptic_seg_predictions = draw_panoptic_seg  # backward compatibility
-    def draw_dataset_dict(self, dic):
-        """
-        Draw annotations/segmentaions in Detectron2 Dataset format.
-        Args:
-            dic (dict): annotation/segmentation data of one image, in Detectron2 Dataset format.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        annos = dic.get("annotations", None)
-        if annos:
-            if "segmentation" in annos[0]:
-                masks = [x["segmentation"] for x in annos]
-            else:
-                masks = None
-            if "keypoints" in annos[0]:
-                keypts = [x["keypoints"] for x in annos]
-                keypts = np.array(keypts).reshape(len(annos), -1, 3)
-            else:
-                keypts = None
-            boxes = [
-                BoxMode.convert(x["bbox"], x["bbox_mode"], BoxMode.XYXY_ABS)
-                if len(x["bbox"]) == 4
-                else x["bbox"]
-                for x in annos
-            ]
-            colors = None
-            category_ids = [x["category_id"] for x in annos]
-            if self._instance_mode == ColorMode.SEGMENTATION and self.metadata.get("thing_colors"):
-                colors = [
-                    self._jitter([x / 255 for x in self.metadata.thing_colors[c]])
-                    for c in category_ids
-                ]
-            names = self.metadata.get("thing_classes", None)
-            labels = _create_text_labels(
-                category_ids,
-                scores=None,
-                class_names=names,
-                is_crowd=[x.get("iscrowd", 0) for x in annos],
-            )
-            self.overlay_instances(
-                labels=labels, boxes=boxes, masks=masks, keypoints=keypts, assigned_colors=colors
-            )
-        sem_seg = dic.get("sem_seg", None)
-        if sem_seg is None and "sem_seg_file_name" in dic:
-            with PathManager.open(dic["sem_seg_file_name"], "rb") as f:
-                sem_seg = Image.open(f)
-                sem_seg = np.asarray(sem_seg, dtype="uint8")
-        if sem_seg is not None:
-            self.draw_sem_seg(sem_seg, area_threshold=0, alpha=0.4)
-        pan_seg = dic.get("pan_seg", None)
-        if pan_seg is None and "pan_seg_file_name" in dic:
-            with PathManager.open(dic["pan_seg_file_name"], "rb") as f:
-                pan_seg = Image.open(f)
-                pan_seg = np.asarray(pan_seg)
-                from panopticapi.utils import rgb2id
-                pan_seg = rgb2id(pan_seg)
-        if pan_seg is not None:
-            segments_info = dic["segments_info"]
-            pan_seg = torch.tensor(pan_seg)
-            self.draw_panoptic_seg(pan_seg, segments_info, area_threshold=0, alpha=0.7)
-        return self.output
-    def overlay_instances(
-        self,
-        *,
-        boxes=None,
-        labels=None,
-        masks=None,
-        keypoints=None,
-        assigned_colors=None,
-        alpha=0.5,
-    ):
-        """
-        Args:
-            boxes (Boxes, RotatedBoxes or ndarray): either a :class:`Boxes`,
-                or an Nx4 numpy array of XYXY_ABS format for the N objects in a single image,
-                or a :class:`RotatedBoxes`,
-                or an Nx5 numpy array of (x_center, y_center, width, height, angle_degrees) format
-                for the N objects in a single image,
-            labels (list[str]): the text to be displayed for each instance.
-            masks (masks-like object): Supported types are:
-                * :class:`detectron2.structures.PolygonMasks`,
-                  :class:`detectron2.structures.BitMasks`.
-                * list[list[ndarray]]: contains the segmentation masks for all objects in one image.
-                  The first level of the list corresponds to individual instances. The second
-                  level to all the polygon that compose the instance, and the third level
-                  to the polygon coordinates. The third level should have the format of
-                  [x0, y0, x1, y1, ..., xn, yn] (n >= 3).
-                * list[ndarray]: each ndarray is a binary mask of shape (H, W).
-                * list[dict]: each dict is a COCO-style RLE.
-            keypoints (Keypoint or array like): an array-like object of shape (N, K, 3),
-                where the N is the number of instances and K is the number of keypoints.
-                The last dimension corresponds to (x, y, visibility or score).
-            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
-                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
-                for full list of formats that the colors are accepted in.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        num_instances = 0
-        if boxes is not None:
-            boxes = self._convert_boxes(boxes)
-            num_instances = len(boxes)
-        if masks is not None:
-            masks = self._convert_masks(masks)
-            if num_instances:
-                assert len(masks) == num_instances
-            else:
-                num_instances = len(masks)
-        if keypoints is not None:
-            if num_instances:
-                assert len(keypoints) == num_instances
-            else:
-                num_instances = len(keypoints)
-            keypoints = self._convert_keypoints(keypoints)
-        if labels is not None:
-            assert len(labels) == num_instances
-        if assigned_colors is None:
-            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
-        if num_instances == 0:
-            return self.output
-        if boxes is not None and boxes.shape[1] == 5:
-            return self.overlay_rotated_instances(
-                boxes=boxes, labels=labels, assigned_colors=assigned_colors
-            )
-        # Display in largest to smallest order to reduce occlusion.
-        areas = None
-        if boxes is not None:
-            areas = np.prod(boxes[:, 2:] - boxes[:, :2], axis=1)
-        elif masks is not None:
-            areas = np.asarray([x.area() for x in masks])
-        if areas is not None:
-            sorted_idxs = np.argsort(-areas).tolist()
-            # Re-order overlapped instances in descending order.
-            boxes = boxes[sorted_idxs] if boxes is not None else None
-            labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-            masks = [masks[idx] for idx in sorted_idxs] if masks is not None else None
-            assigned_colors = [assigned_colors[idx] for idx in sorted_idxs]
-            keypoints = keypoints[sorted_idxs] if keypoints is not None else None
-        for i in range(num_instances):
-            color = assigned_colors[i]
-            if boxes is not None:
-                self.draw_box(boxes[i], edge_color=color)
-            if masks is not None:
-                for segment in masks[i].polygons:
-                    self.draw_polygon(segment.reshape(-1, 2), color, alpha=alpha)
-            if labels is not None:
-                # first get a box
-                if boxes is not None:
-                    x0, y0, x1, y1 = boxes[i]
-                    text_pos = (x0, y0)  # if drawing boxes, put text on the box corner.
-                    horiz_align = "left"
-                elif masks is not None:
-                    # skip small mask without polygon
-                    if len(masks[i].polygons) == 0:
-                        continue
-                    x0, y0, x1, y1 = masks[i].bbox()
-                    # draw text in the center (defined by median) when box is not drawn
-                    # median is less sensitive to outliers.
-                    text_pos = np.median(masks[i].mask.nonzero(), axis=1)[::-1]
-                    horiz_align = "center"
-                else:
-                    continue  # drawing the box confidence for keypoints isn't very useful.
-                # for small objects, draw text at the side to avoid occlusion
-                instance_area = (y1 - y0) * (x1 - x0)
-                if (
-                    instance_area < _SMALL_OBJECT_AREA_THRESH * self.output.scale
-                    or y1 - y0 < 40 * self.output.scale
-                ):
-                    if y1 >= self.output.height - 5:
-                        text_pos = (x1, y0)
-                    else:
-                        text_pos = (x0, y1)
-                height_ratio = (y1 - y0) / np.sqrt(self.output.height * self.output.width)
-                lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-                font_size = (
-                    np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2)
-                    * 0.5
-                    * self._default_font_size
-                )
-                self.draw_text(
-                    labels[i],
-                    text_pos,
-                    color=lighter_color,
-                    horizontal_alignment=horiz_align,
-                    font_size=font_size,
-                )
-        # draw keypoints
-        if keypoints is not None:
-            for keypoints_per_instance in keypoints:
-                self.draw_and_connect_keypoints(keypoints_per_instance)
-        return self.output
-    def overlay_rotated_instances(self, boxes=None, labels=None, assigned_colors=None):
-        """
-        Args:
-            boxes (ndarray): an Nx5 numpy array of
-                (x_center, y_center, width, height, angle_degrees) format
-                for the N objects in a single image.
-            labels (list[str]): the text to be displayed for each instance.
-            assigned_colors (list[matplotlib.colors]): a list of colors, where each color
-                corresponds to each mask or box in the image. Refer to 'matplotlib.colors'
-                for full list of formats that the colors are accepted in.
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        num_instances = len(boxes)
-        if assigned_colors is None:
-            assigned_colors = [random_color(rgb=True, maximum=1) for _ in range(num_instances)]
-        if num_instances == 0:
-            return self.output
-        # Display in largest to smallest order to reduce occlusion.
-        if boxes is not None:
-            areas = boxes[:, 2] * boxes[:, 3]
-        sorted_idxs = np.argsort(-areas).tolist()
-        # Re-order overlapped instances in descending order.
-        boxes = boxes[sorted_idxs]
-        labels = [labels[k] for k in sorted_idxs] if labels is not None else None
-        colors = [assigned_colors[idx] for idx in sorted_idxs]
-        for i in range(num_instances):
-            self.draw_rotated_box_with_label(
-                boxes[i], edge_color=colors[i], label=labels[i] if labels is not None else None
-            )
-        return self.output
-    def draw_and_connect_keypoints(self, keypoints):
-        """
-        Draws keypoints of an instance and follows the rules for keypoint connections
-        to draw lines between appropriate keypoints. This follows color heuristics for
-        line color.
-        Args:
-            keypoints (Tensor): a tensor of shape (K, 3), where K is the number of keypoints
-                and the last dimension corresponds to (x, y, probability).
-        Returns:
-            output (VisImage): image object with visualizations.
-        """
-        visible = {}
-        keypoint_names = self.metadata.get("keypoint_names")
-        for idx, keypoint in enumerate(keypoints):
-            # draw keypoint
-            x, y, prob = keypoint
-            if prob > self.keypoint_threshold:
-                self.draw_circle((x, y), color=_RED)
-                if keypoint_names:
-                    keypoint_name = keypoint_names[idx]
-                    visible[keypoint_name] = (x, y)
-        if self.metadata.get("keypoint_connection_rules"):
-            for kp0, kp1, color in self.metadata.keypoint_connection_rules:
-                if kp0 in visible and kp1 in visible:
-                    x0, y0 = visible[kp0]
-                    x1, y1 = visible[kp1]
-                    color = tuple(x / 255.0 for x in color)
-                    self.draw_line([x0, x1], [y0, y1], color=color)
-        # draw lines from nose to mid-shoulder and mid-shoulder to mid-hip
-        # Note that this strategy is specific to person keypoints.
-        # For other keypoints, it should just do nothing
-        try:
-            ls_x, ls_y = visible["left_shoulder"]
-            rs_x, rs_y = visible["right_shoulder"]
-            mid_shoulder_x, mid_shoulder_y = (ls_x + rs_x) / 2, (ls_y + rs_y) / 2
-        except KeyError:
-            pass
-        else:
-            # draw line from nose to mid-shoulder
-            nose_x, nose_y = visible.get("nose", (None, None))
-            if nose_x is not None:
-                self.draw_line([nose_x, mid_shoulder_x], [nose_y, mid_shoulder_y], color=_RED)
-            try:
-                # draw line from mid-shoulder to mid-hip
-                lh_x, lh_y = visible["left_hip"]
-                rh_x, rh_y = visible["right_hip"]
-            except KeyError:
-                pass
-            else:
-                mid_hip_x, mid_hip_y = (lh_x + rh_x) / 2, (lh_y + rh_y) / 2
-                self.draw_line([mid_hip_x, mid_shoulder_x], [mid_hip_y, mid_shoulder_y], color=_RED)
-        return self.output
-    """
-    Primitive drawing functions:
-    """
-    def draw_text(
-        self,
-        text,
-        position,
-        *,
-        font_size=None,
-        color="g",
-        horizontal_alignment="center",
-        rotation=0,
-    ):
-        """
-        Args:
-            text (str): class label
-            position (tuple): a tuple of the x and y coordinates to place text on image.
-            font_size (int, optional): font of the text. If not provided, a font size
-                proportional to the image width is calculated and used.
-            color: color of the text. Refer to `matplotlib.colors` for full list
-                of formats that are accepted.
-            horizontal_alignment (str): see `matplotlib.text.Text`
-            rotation: rotation angle in degrees CCW
-        Returns:
-            output (VisImage): image object with text drawn.
-        """
-        if not font_size:
-            font_size = self._default_font_size
-        # since the text background is dark, we don't want the text to be dark
-        color = np.maximum(list(mplc.to_rgb(color)), 0.2)
-        color[np.argmax(color)] = max(0.8, np.max(color))
-        x, y = position
-        self.output.ax.text(
-            x,
-            y,
-            text,
-            size=font_size * self.output.scale,
-            family="sans-serif",
-            bbox={"facecolor": "black", "alpha": 0.8, "pad": 0.7, "edgecolor": "none"},
-            verticalalignment="top",
-            horizontalalignment=horizontal_alignment,
-            color=color,
-            zorder=10,
-            rotation=rotation,
-        )
-        return self.output
-    def draw_box(self, box_coord, alpha=0.5, edge_color="g", line_style="-"):
-        """
-        Args:
-            box_coord (tuple): a tuple containing x0, y0, x1, y1 coordinates, where x0 and y0
-                are the coordinates of the image's top left corner. x1 and y1 are the
-                coordinates of the image's bottom right corner.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
-                for full list of formats that are accepted.
-            line_style (string): the string to use to create the outline of the boxes.
-        Returns:
-            output (VisImage): image object with box drawn.
-        """
-        x0, y0, x1, y1 = box_coord
-        width = x1 - x0
-        height = y1 - y0
-        linewidth = max(self._default_font_size / 4, 1)
-        self.output.ax.add_patch(
-            mpl.patches.Rectangle(
-                (x0, y0),
-                width,
-                height,
-                fill=False,
-                edgecolor=edge_color,
-                linewidth=linewidth * self.output.scale,
-                alpha=alpha,
-                linestyle=line_style,
-            )
-        )
-        return self.output
-    def draw_rotated_box_with_label(
-        self, rotated_box, alpha=0.5, edge_color="g", line_style="-", label=None
-    ):
-        """
-        Draw a rotated box with label on its top-left corner.
-        Args:
-            rotated_box (tuple): a tuple containing (cnt_x, cnt_y, w, h, angle),
-                where cnt_x and cnt_y are the center coordinates of the box.
-                w and h are the width and height of the box. angle represents how
-                many degrees the box is rotated CCW with regard to the 0-degree box.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            edge_color: color of the outline of the box. Refer to `matplotlib.colors`
-                for full list of formats that are accepted.
-            line_style (string): the string to use to create the outline of the boxes.
-            label (string): label for rotated box. It will not be rendered when set to None.
-        Returns:
-            output (VisImage): image object with box drawn.
-        """
-        cnt_x, cnt_y, w, h, angle = rotated_box
-        area = w * h
-        # use thinner lines when the box is small
-        linewidth = self._default_font_size / (
-            6 if area < _SMALL_OBJECT_AREA_THRESH * self.output.scale else 3
-        )
-        theta = angle * math.pi / 180.0
-        c = math.cos(theta)
-        s = math.sin(theta)
-        rect = [(-w / 2, h / 2), (-w / 2, -h / 2), (w / 2, -h / 2), (w / 2, h / 2)]
-        # x: left->right ; y: top->down
-        rotated_rect = [(s * yy + c * xx + cnt_x, c * yy - s * xx + cnt_y) for (xx, yy) in rect]
-        for k in range(4):
-            j = (k + 1) % 4
-            self.draw_line(
-                [rotated_rect[k][0], rotated_rect[j][0]],
-                [rotated_rect[k][1], rotated_rect[j][1]],
-                color=edge_color,
-                linestyle="--" if k == 1 else line_style,
-                linewidth=linewidth,
-            )
-        if label is not None:
-            text_pos = rotated_rect[1]  # topleft corner
-            height_ratio = h / np.sqrt(self.output.height * self.output.width)
-            label_color = self._change_color_brightness(edge_color, brightness_factor=0.7)
-            font_size = (
-                np.clip((height_ratio - 0.02) / 0.08 + 1, 1.2, 2) * 0.5 * self._default_font_size
-            )
-            self.draw_text(label, text_pos, color=label_color, font_size=font_size, rotation=angle)
-        return self.output
-    def draw_circle(self, circle_coord, color, radius=3):
-        """
-        Args:
-            circle_coord (list(int) or tuple(int)): contains the x and y coordinates
-                of the center of the circle.
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            radius (int): radius of the circle.
-        Returns:
-            output (VisImage): image object with box drawn.
-        """
-        x, y = circle_coord
-        self.output.ax.add_patch(
-            mpl.patches.Circle(circle_coord, radius=radius, fill=True, color=color)
-        )
-        return self.output
-    def draw_line(self, x_data, y_data, color, linestyle="-", linewidth=None):
-        """
-        Args:
-            x_data (list[int]): a list containing x values of all the points being drawn.
-                Length of list should match the length of y_data.
-            y_data (list[int]): a list containing y values of all the points being drawn.
-                Length of list should match the length of x_data.
-            color: color of the line. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            linestyle: style of the line. Refer to `matplotlib.lines.Line2D`
-                for a full list of formats that are accepted.
-            linewidth (float or None): width of the line. When it's None,
-                a default value will be computed and used.
-        Returns:
-            output (VisImage): image object with line drawn.
-        """
-        if linewidth is None:
-            linewidth = self._default_font_size / 3
-        linewidth = max(linewidth, 1)
-        self.output.ax.add_line(
-            mpl.lines.Line2D(
-                x_data,
-                y_data,
-                linewidth=linewidth * self.output.scale,
-                color=color,
-                linestyle=linestyle,
-            )
-        )
-        return self.output
-    def draw_binary_mask(
-        self, binary_mask, color=None, *, edge_color=None, text=None, alpha=0.7, area_threshold=10
-    ):
-        """
-        Args:
-            binary_mask (ndarray): numpy array of shape (H, W), where H is the image height and
-                W is the image width. Each value in the array is either a 0 or 1 value of uint8
-                type.
-            color: color of the mask. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted. If None, will pick a random color.
-            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
-                full list of formats that are accepted.
-            text (str): if None, will be drawn on the object
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-            area_threshold (float): a connected component smaller than this area will not be shown.
-        Returns:
-            output (VisImage): image object with mask drawn.
-        """
-        if color is None:
-            color = random_color(rgb=True, maximum=1)
-        color = mplc.to_rgb(color)
-        has_valid_segment = False
-        binary_mask = binary_mask.astype("uint8")  # opencv needs uint8
-        mask = GenericMask(binary_mask, self.output.height, self.output.width)
-        shape2d = (binary_mask.shape[0], binary_mask.shape[1])
-        if not mask.has_holes:
-            # draw polygons for regular masks
-            for segment in mask.polygons:
-                area = mask_util.area(mask_util.frPyObjects([segment], shape2d[0], shape2d[1]))
-                if area < (area_threshold or 0):
-                    continue
-                has_valid_segment = True
-                segment = segment.reshape(-1, 2)
-                self.draw_polygon(segment, color=color, edge_color=edge_color, alpha=alpha)
-        else:
-            # TODO: Use Path/PathPatch to draw vector graphics:
-            # https://stackoverflow.com/questions/8919719/how-to-plot-a-complex-polygon
-            rgba = np.zeros(shape2d + (4,), dtype="float32")
-            rgba[:, :, :3] = color
-            rgba[:, :, 3] = (mask.mask == 1).astype("float32") * alpha
-            has_valid_segment = True
-            self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
-        if text is not None and has_valid_segment:
-            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-            self._draw_text_in_mask(binary_mask, text, lighter_color)
-        return self.output
-    def draw_soft_mask(self, soft_mask, color=None, *, text=None, alpha=0.5):
-        """
-        Args:
-            soft_mask (ndarray): float array of shape (H, W), each value in [0, 1].
-            color: color of the mask. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted. If None, will pick a random color.
-            text (str): if None, will be drawn on the object
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-        Returns:
-            output (VisImage): image object with mask drawn.
-        """
-        if color is None:
-            color = random_color(rgb=True, maximum=1)
-        color = mplc.to_rgb(color)
-        shape2d = (soft_mask.shape[0], soft_mask.shape[1])
-        rgba = np.zeros(shape2d + (4,), dtype="float32")
-        rgba[:, :, :3] = color
-        rgba[:, :, 3] = soft_mask * alpha
-        self.output.ax.imshow(rgba, extent=(0, self.output.width, self.output.height, 0))
-        if text is not None:
-            lighter_color = self._change_color_brightness(color, brightness_factor=0.7)
-            binary_mask = (soft_mask > 0.5).astype("uint8")
-            self._draw_text_in_mask(binary_mask, text, lighter_color)
-        return self.output
-    def draw_polygon(self, segment, color, edge_color=None, alpha=0.5):
-        """
-        Args:
-            segment: numpy array of shape Nx2, containing all the points in the polygon.
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            edge_color: color of the polygon edges. Refer to `matplotlib.colors` for a
-                full list of formats that are accepted. If not provided, a darker shade
-                of the polygon color will be used instead.
-            alpha (float): blending efficient. Smaller values lead to more transparent masks.
-        Returns:
-            output (VisImage): image object with polygon drawn.
-        """
-        if edge_color is None:
-            # make edge color darker than the polygon color
-            if alpha > 0.8:
-                edge_color = self._change_color_brightness(color, brightness_factor=-0.7)
-            else:
-                edge_color = color
-        edge_color = mplc.to_rgb(edge_color) + (1,)
-        polygon = mpl.patches.Polygon(
-            segment,
-            fill=True,
-            facecolor=mplc.to_rgb(color) + (alpha,),
-            edgecolor=edge_color,
-            linewidth=max(self._default_font_size // 15 * self.output.scale, 1),
-        )
-        self.output.ax.add_patch(polygon)
-        return self.output
-    """
-    Internal methods:
-    """
-    def _jitter(self, color):
-        """
-        Randomly modifies given color to produce a slightly different color than the color given.
-        Args:
-            color (tuple[double]): a tuple of 3 elements, containing the RGB values of the color
-                picked. The values in the list are in the [0.0, 1.0] range.
-        Returns:
-            jittered_color (tuple[double]): a tuple of 3 elements, containing the RGB values of the
-                color after being jittered. The values in the list are in the [0.0, 1.0] range.
-        """
-        color = mplc.to_rgb(color)
-        # np.random.seed(0)
-        vec = np.random.rand(3)
-        # better to do it in another color space
-        vec = vec / np.linalg.norm(vec) * 0.5
-        res = np.clip(vec + color, 0, 1)
-        return tuple(res)
-    def _create_grayscale_image(self, mask=None):
-        """
-        Create a grayscale version of the original image.
-        The colors in masked area, if given, will be kept.
-        """
-        img_bw = self.img.astype("f4").mean(axis=2)
-        img_bw = np.stack([img_bw] * 3, axis=2)
-        if mask is not None:
-            img_bw[mask] = self.img[mask]
-        return img_bw
-    def _change_color_brightness(self, color, brightness_factor):
-        """
-        Depending on the brightness_factor, gives a lighter or darker color i.e. a color with
-        less or more saturation than the original color.
-        Args:
-            color: color of the polygon. Refer to `matplotlib.colors` for a full list of
-                formats that are accepted.
-            brightness_factor (float): a value in [-1.0, 1.0] range. A lightness factor of
-                0 will correspond to no change, a factor in [-1.0, 0) range will result in
-                a darker color and a factor in (0, 1.0] range will result in a lighter color.
-        Returns:
-            modified_color (tuple[double]): a tuple containing the RGB values of the
-                modified color. Each value in the tuple is in the [0.0, 1.0] range.
-        """
-        assert brightness_factor >= -1.0 and brightness_factor <= 1.0
-        color = mplc.to_rgb(color)
-        polygon_color = colorsys.rgb_to_hls(*mplc.to_rgb(color))
-        modified_lightness = polygon_color[1] + (brightness_factor * polygon_color[1])
-        modified_lightness = 0.0 if modified_lightness < 0.0 else modified_lightness
-        modified_lightness = 1.0 if modified_lightness > 1.0 else modified_lightness
-        modified_color = colorsys.hls_to_rgb(polygon_color[0], modified_lightness, polygon_color[2])
-        return modified_color
-    def _convert_boxes(self, boxes):
-        """
-        Convert different format of boxes to an NxB array, where B = 4 or 5 is the box dimension.
-        """
-        if isinstance(boxes, Boxes) or isinstance(boxes, RotatedBoxes):
-            return boxes.tensor.detach().numpy()
-        else:
-            return np.asarray(boxes)
-    def _convert_masks(self, masks_or_polygons):
-        """
-        Convert different format of masks or polygons to a tuple of masks and polygons.
-        Returns:
-            list[GenericMask]:
-        """
-        m = masks_or_polygons
-        if isinstance(m, PolygonMasks):
-            m = m.polygons
-        if isinstance(m, BitMasks):
-            m = m.tensor.numpy()
-        if isinstance(m, torch.Tensor):
-            m = m.numpy()
-        ret = []
-        for x in m:
-            if isinstance(x, GenericMask):
-                ret.append(x)
-            else:
-                ret.append(GenericMask(x, self.output.height, self.output.width))
-        return ret
-    def _draw_text_in_mask(self, binary_mask, text, color):
-        """
-        Find proper places to draw text given a binary mask.
-        """
-        # TODO sometimes drawn on wrong objects. the heuristics here can improve.
-        _num_cc, cc_labels, stats, centroids = cv2.connectedComponentsWithStats(binary_mask, 8)
-        if stats[1:, -1].size == 0:
-            return
-        largest_component_id = np.argmax(stats[1:, -1]) + 1
-        # draw text on the largest component, as well as other very large components.
-        for cid in range(1, _num_cc):
-            if cid == largest_component_id or stats[cid, -1] > _LARGE_MASK_AREA_THRESH:
-                # median is more stable than centroid
-                # center = centroids[largest_component_id]
-                center = np.median((cc_labels == cid).nonzero(), axis=1)[::-1]
-                self.draw_text(text, center, color=color)
-    def _convert_keypoints(self, keypoints):
-        if isinstance(keypoints, Keypoints):
-            keypoints = keypoints.tensor
-        keypoints = np.asarray(keypoints)
-        return keypoints
-    def get_output(self):
-        """
-        Returns:
-            output (VisImage): the image output containing the visualizations added
-            to the image.
-        """
-        return self.output