Spaces:

Purple11
/

Grounded-Diffusion

Runtime error

App Files Files Community

Purple11 committed on Feb 17, 2023

Commit

0d2623f

1 Parent(s): 3d41704

Update app.py

Browse files

Files changed (1) hide show

app.py +414 -58

app.py CHANGED Viewed

@@ -4,75 +4,431 @@ import torch
 import numpy as np
 from torchvision import transforms
-title = "Remove Bg"
-description = "Automatically remove the image background from a profile photo."
-article = "<p style='text-align: center'><a href='https://news.machinelearning.sg/posts/beautiful_profile_pics_remove_background_image_with_deeplabv3/'>Blog</a> | <a href='https://github.com/eugenesiow/practical-ml'>Github Repo</a></p>"
-def make_transparent_foreground(pic, mask):
-    # split the image into channels
-    b, g, r = cv2.split(np.array(pic).astype('uint8'))
-    # add an alpha channel with and fill all with transparent pixels (max 255)
-    a = np.ones(mask.shape, dtype='uint8') * 255
-    # merge the alpha channel back
-    alpha_im = cv2.merge([b, g, r, a], 4)
-    # create a transparent background
-    bg = np.zeros(alpha_im.shape)
-    # setup the new mask
-    new_mask = np.stack([mask, mask, mask, mask], axis=2)
-    # copy only the foreground color pixels from the original image where mask is set
-    foreground = np.where(new_mask, alpha_im, bg).astype(np.uint8)
-    return foreground
-def remove_background(input_image):
-    preprocess = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
-    ])
-    input_tensor = preprocess(input_image)
-    input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model
-    # move the input and model to GPU for speed if available
-    if torch.cuda.is_available():
-        input_batch = input_batch.to('cuda')
-        model.to('cuda')
     with torch.no_grad():
-        output = model(input_batch)['out'][0]
-    output_predictions = output.argmax(0)
-    # create a binary (black and white) mask of the profile foreground
-    mask = output_predictions.byte().cpu().numpy()
-    background = np.zeros(mask.shape)
-    bin_mask = np.where(mask, 255, background).astype(np.uint8)
-    foreground = make_transparent_foreground(input_image, bin_mask)
-    return foreground, bin_mask
-def inference(img):
-    foreground, _ = remove_background(img)
-    return foreground
-torch.hub.download_url_to_file('https://pbs.twimg.com/profile_images/691700243809718272/z7XZUARB_400x400.jpg',
-                               'demis.jpg')
-torch.hub.download_url_to_file('https://hai.stanford.edu/sites/default/files/styles/person_medium/public/2020-03/hai_1512feifei.png?itok=INFuLABp',
-                               'lifeifei.png')
-model = torch.hub.load('pytorch/vision:v0.6.0', 'deeplabv3_resnet101', pretrained=True)
-model.eval()
 gr.Interface(
     inference,
-    gr.inputs.Image(type="pil", label="Input"),
     gr.outputs.Image(type="pil", label="Output"),
-    title=title,
-    description=description,
-    article=article,
-    examples=[['demis.jpg'], ['lifeifei.png']],
-    enable_queue=True
 ).launch(debug=False)

 import numpy as np
 from torchvision import transforms
+# title = "Remove Bg"
+# description = "Automatically remove the image background from a profile photo."
+# article = "<p style='text-align: center'><a href='https://news.machinelearning.sg/posts/beautiful_profile_pics_remove_background_image_with_deeplabv3/'>Blog</a> | <a href='https://github.com/eugenesiow/practical-ml'>Github Repo</a></p>"
+import argparse, os
+import cv2
+import torch
+import numpy as np
+import torchvision
+from omegaconf import OmegaConf
+from PIL import Image
+from tqdm import tqdm, trange
+from itertools import islice
+from einops import rearrange
+from torchvision.utils import make_grid
+import time
+from pytorch_lightning import seed_everything
+from torch import autocast
+from contextlib import nullcontext
+from ldm.util import instantiate_from_config
+from ldm.models.diffusion.ddim import DDIMSampler
+from ldm.modules.diffusionmodules.openaimodel import clear_feature_dic,get_feature_dic
+from ldm.models.seg_module import Segmodule
+import numpy as np
+os.environ["CUDA_VISIBLE_DEVICES"] = "1"
def chunk(it, size):
    """Split an iterable into consecutive tuples of at most ``size`` items.

    Returns an iterator of tuples. The final tuple may be shorter than
    ``size``; iteration ends as soon as an empty tuple would be produced.
    """
    source = iter(it)

    def take():
        # Pull the next ``size`` items (fewer at the tail) from the shared iterator.
        return tuple(islice(source, size))

    # Two-argument iter(): keep calling ``take`` until it returns the sentinel ().
    return iter(take, ())
def numpy_to_pil(images):
    """Convert a numpy image, or a batch of images, into a list of PIL images.

    A 3-dimensional array is treated as a single image and wrapped into a
    batch of one. Values are multiplied by 255, rounded and cast to uint8
    before conversion (presumably the input is float data in [0, 1] --
    confirm against callers).
    """
    batch = images[None, ...] if images.ndim == 3 else images
    as_bytes = (batch * 255).round().astype("uint8")
    converted = []
    for frame in as_bytes:
        converted.append(Image.fromarray(frame))
    return converted
def load_model_from_config(config, ckpt, verbose=False):
    """Instantiate the model described by ``config`` and load weights from ``ckpt``.

    Args:
        config: configuration object whose ``model`` attribute is passed to
            ``instantiate_from_config``.
        ckpt: path to a checkpoint readable by ``torch.load``; the loaded
            object must contain a ``"state_dict"`` entry and may contain
            ``"global_step"``.
        verbose: when true, print the global step and any missing or
            unexpected state-dict keys.

    Returns:
        The model in eval mode, moved to CUDA when a CUDA device is available.
    """
    # NOTE(review): torch.load unpickles the file; only use trusted checkpoints.
    pl_sd = torch.load(ckpt, map_location="cpu")
    # The original `if` had only a commented-out print as its body, which is
    # an IndentationError; the message is now gated on `verbose` instead.
    if verbose and "global_step" in pl_sd:
        print(f"Global Step: {pl_sd['global_step']}")
    sd = pl_sd["state_dict"]
    model = instantiate_from_config(config.model)
    # Loading the weights had been commented out together with the debug
    # prints, leaving the model with whatever weights instantiation produced.
    missing, unexpected = model.load_state_dict(sd, strict=False)
    if verbose and len(missing) > 0:
        print("missing keys:")
        print(missing)
    if verbose and len(unexpected) > 0:
        print("unexpected keys:")
        print(unexpected)
    # The caller falls back to CPU when CUDA is absent, so do not fail here.
    if torch.cuda.is_available():
        model.cuda()
    model.eval()
    return model
def put_watermark(img, wm_encoder=None):
    """Embed a watermark into ``img`` when an encoder is supplied.

    Without an encoder the input is returned untouched. Otherwise the image
    is converted from RGB to BGR, passed to ``wm_encoder.encode`` with the
    ``'dwtDct'`` method, and the result is returned as a PIL image with the
    channel order reversed back.
    """
    if wm_encoder is None:
        return img
    bgr_pixels = cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
    marked = wm_encoder.encode(bgr_pixels, 'dwtDct')
    # Reverse the last axis to undo the RGB -> BGR conversion.
    return Image.fromarray(marked[:, :, ::-1])
def load_replacement(x):
    """Return the bundled replacement image shaped and typed like ``x``.

    The image at ``assets/rick.jpeg`` is loaded as RGB, resized to the first
    two dimensions of ``x`` and scaled by 1/255 into ``x``'s dtype. If
    anything goes wrong (missing file, shape mismatch, ``x`` without a
    shape, ...) the original ``x`` is returned instead -- this is a
    deliberate best-effort fallback.
    """
    try:
        dims = x.shape
        picture = Image.open("assets/rick.jpeg").convert("RGB")
        # PIL expects (width, height), i.e. the first two dims swapped.
        picture = picture.resize((dims[1], dims[0]))
        replacement = (np.array(picture) / 255.0).astype(x.dtype)
        assert replacement.shape == x.shape
    except Exception:
        return x
    return replacement
def plot_mask(img, masks, colors=None, alpha=0.8, indexlist=(0, 1)) -> np.ndarray:
    """Overlay masks on an image using a fixed colour palette.

    For every mask, pixels where the mask is non-zero are blended with the
    palette colour selected by ``indexlist`` (``alpha`` is the colour
    weight); all other pixels are blended 60/40 with a white background.
    The blend is applied once per mask, so with several masks the
    non-masked area is washed out repeatedly (kept as in the original).

    Args:
        img: image array of shape (H, W, 3); any H and W are accepted.
        masks: iterable of (H, W) masks; non-zero entries mark the object.
        colors: accepted for backward compatibility; ignored (the original
            implementation overwrote it as well).
        alpha: weight of the palette colour inside a mask.
        indexlist: palette index for each mask, by position. Must provide
            an entry for every mask, otherwise ``IndexError`` is raised.

    Returns:
        The blended image as a uint8 array.
    """
    base_palette = [
        [255, 97, 0], [128, 42, 42], [220, 220, 220], [255, 153, 18],
        [56, 94, 15], [127, 255, 212], [210, 180, 140], [221, 160, 221],
        [255, 0, 0], [255, 128, 0], [255, 255, 0], [128, 255, 0],
        [0, 255, 0], [0, 255, 128], [0, 255, 255], [0, 128, 255],
        [0, 0, 255], [128, 0, 255], [255, 0, 255], [255, 0, 128],
    ]
    # Each entry is a length-3 colour that broadcasts against (H, W, 3); the
    # original materialised fixed 512x512 planes and failed for other sizes.
    palette = np.array(base_palette * 6)
    background = np.ones(img.shape) * 255
    for position, mask in enumerate(masks):
        color = palette[indexlist[position]]
        mask_rgb = np.stack([mask, mask, mask], -1)
        img = np.where(
            mask_rgb,
            img * (1 - alpha) + color * alpha,
            background * 0.4 + img * 0.6,
        )
    return img.astype(np.uint8)
def create_parser(argv=None):
    """Build the command-line parser and return the parsed options.

    Args:
        argv: optional list of argument strings to parse. When ``None``
            (the default, matching the original behaviour) the arguments
            are taken from ``sys.argv``. Passing an explicit list, e.g.
            ``[]``, lets a hosting process obtain the defaults without its
            own command line being interpreted.

    Returns:
        An ``argparse.Namespace`` holding every option below; note that
        ``--from-file`` is exposed as ``from_file``.
    """
    parser = argparse.ArgumentParser()

    # What to generate and where to write it.
    parser.add_argument(
        "--prompt", type=str, nargs="?",
        default="a photo of a lion on a mountain top at sunset",
        help="the prompt to render",
    )
    parser.add_argument(
        "--category", type=str, nargs="?", default="lion",
        help="the category to ground",
    )
    parser.add_argument(
        "--outdir", type=str, nargs="?",
        help="dir to write results to",
        default="outputs/txt2img-samples",
    )
    parser.add_argument(
        "--skip_grid", action='store_true',
        help="do not save a grid, only individual samples. Helpful when evaluating lots of samples",
    )
    parser.add_argument(
        "--skip_save", action='store_true',
        help="do not save individual samples. For speed measurements.",
    )

    # Sampler configuration.
    parser.add_argument(
        "--ddim_steps", type=int, default=50,
        help="number of ddim sampling steps",
    )
    parser.add_argument(
        "--plms", action='store_true',
        help="use plms sampling",
    )
    parser.add_argument(
        "--laion400m", action='store_true',
        help="uses the LAION400M model",
    )
    parser.add_argument(
        "--fixed_code", action='store_true',
        help="if enabled, uses the same starting code across samples ",
    )
    parser.add_argument(
        "--ddim_eta", type=float, default=0.0,
        help="ddim eta (eta=0.0 corresponds to deterministic sampling",
    )
    parser.add_argument(
        "--n_iter", type=int, default=1,
        help="sample this often",
    )

    # Image and latent geometry.
    parser.add_argument(
        "--H", type=int, default=512,
        help="image height, in pixel space",
    )
    parser.add_argument(
        "--W", type=int, default=512,
        help="image width, in pixel space",
    )
    parser.add_argument(
        "--C", type=int, default=4,
        help="latent channels",
    )
    parser.add_argument(
        "--f", type=int, default=8,
        help="downsampling factor",
    )

    # Batch layout and guidance.
    parser.add_argument(
        "--n_samples", type=int, default=1,
        help="how many samples to produce for each given prompt. A.k.a. batch size",
    )
    parser.add_argument(
        "--n_rows", type=int, default=0,
        help="rows in the grid (default: n_samples)",
    )
    parser.add_argument(
        "--scale", type=float, default=7.5,
        help="unconditional guidance scale: eps = eps(x, empty) + scale * (eps(x, cond) - eps(x, empty))",
    )
    parser.add_argument(
        "--from-file", type=str,
        help="if specified, load prompts from this file",
    )

    # Model configuration and checkpoints.
    parser.add_argument(
        "--config", type=str,
        default="configs/stable-diffusion/v1-inference.yaml",
        help="path to config which constructs model",
    )
    parser.add_argument(
        "--sd_ckpt", type=str, default="stable_diffusion.ckpt",
        help="path to checkpoint of stable diffusion model",
    )
    parser.add_argument(
        "--grounding_ckpt", type=str, default="grounding_module.pth",
        help="path to checkpoint of grounding module",
    )

    # Reproducibility and numeric precision.
    parser.add_argument(
        "--seed", type=int, default=42,
        help="the seed (for reproducible sampling)",
    )
    parser.add_argument(
        "--precision", type=str,
        help="evaluate at this precision",
        choices=["full", "autocast"],
        default="autocast",
    )

    # argv=None makes argparse fall back to sys.argv[1:], as before.
    opt = parser.parse_args(argv)
    return opt
def inference(input_prompt, input_category):
    """Generate an image for a prompt and a mask overlay for a category.

    Options come from ``create_parser()``. The diffusion model and the
    grounding module are loaded from the configured checkpoints on every
    call, one batch is sampled with DDIM, the first decoded sample is saved
    to ``demo/demo.png``, and the grounding module predicts a mask for
    ``input_category`` which is drawn onto the image with ``plot_mask``.

    Args:
        input_prompt: text prompt used when no ``--from-file`` is given.
        input_category: class name inserted into "a photograph of a ...".

    Returns:
        ``(image, overlay)``, both uint8 arrays of shape (H, W, 3), for the
        first sample of the first prompt batch. Returns ``None`` implicitly
        if there is nothing to sample (``n_iter`` of 0 or an empty prompt
        file).
    """
    opt = create_parser()
    seed_everything(opt.seed)

    # NOTE(review): both models are rebuilt on every request; consider
    # loading them once at start-up.
    config = OmegaConf.load(f"{opt.config}")
    model = load_model_from_config(config, f"{opt.sd_ckpt}")
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    model = model.to(device)

    seg_module = Segmodule().to(device)
    seg_module.load_state_dict(torch.load(opt.grounding_ckpt, map_location="cpu"), strict=True)

    sampler = DDIMSampler(model)

    os.makedirs(opt.outdir, exist_ok=True)
    outpath = opt.outdir
    # The sample is written to demo/demo.png below; saving fails when the
    # directory is missing, so create it up front.
    os.makedirs("demo", exist_ok=True)

    batch_size = opt.n_samples
    precision_scope = autocast if opt.precision == "autocast" else nullcontext
    with torch.no_grad():
        with precision_scope("cuda"):
            with model.ema_scope():
                prompt = input_prompt
                class_name = input_category
                if not opt.from_file:
                    assert prompt is not None
                    data = [batch_size * [prompt]]
                else:
                    with open(opt.from_file, "r") as f:
                        data = f.read().splitlines()
                        data = list(chunk(data, batch_size))

                sample_path = os.path.join(outpath, "samples")
                os.makedirs(sample_path, exist_ok=True)

                start_code = None
                if opt.fixed_code:
                    start_code = torch.randn(
                        [opt.n_samples, opt.C, opt.H // opt.f, opt.W // opt.f],
                        device=device,
                    )

                for n in trange(opt.n_iter, desc="Sampling"):
                    for prompts in tqdm(data, desc="data"):
                        # Reset the feature store before sampling; it is read
                        # back through get_feature_dic() after decoding.
                        clear_feature_dic()
                        uc = None
                        if opt.scale != 1.0:
                            uc = model.get_learned_conditioning(batch_size * [""])
                        if isinstance(prompts, tuple):
                            prompts = list(prompts)
                        c = model.get_learned_conditioning(prompts)
                        shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
                        samples_ddim, _, _ = sampler.sample(
                            S=opt.ddim_steps,
                            conditioning=c,
                            batch_size=opt.n_samples,
                            shape=shape,
                            verbose=False,
                            unconditional_guidance_scale=opt.scale,
                            unconditional_conditioning=uc,
                            eta=opt.ddim_eta,
                            x_T=start_code,
                        )

                        x_samples_ddim = model.decode_first_stage(samples_ddim)
                        diffusion_features = get_feature_dic()

                        # Map the first sample from [-1, 1] to [0, 255], HWC.
                        x_sample = torch.clamp((x_samples_ddim[0] + 1.0) / 2.0, min=0.0, max=1.0)
                        x_sample = 255. * rearrange(x_sample.cpu().numpy(), 'c h w -> h w c')
                        img = x_sample.astype(np.uint8)
                        Image.fromarray(img).save("demo/demo.png")

                        query_text = "a photograph of a " + class_name
                        c_split = model.cond_stage_model.tokenizer.tokenize(query_text)
                        sen_text_embedding = model.get_learned_conditioning(query_text)
                        # Presumably positions 5.. hold the class-name tokens
                        # (start token + the four prefix words) -- confirm
                        # against the tokenizer.
                        class_embedding = sen_text_embedding[:, 5:len(c_split) + 1, :]
                        if class_embedding.size()[1] > 1:
                            # Multi-token class names are averaged into one embedding.
                            class_embedding = torch.unsqueeze(class_embedding.mean(1), 1)
                        text_embedding = class_embedding.repeat(batch_size, 1, 1)

                        pred_seg_total = seg_module(diffusion_features, text_embedding)
                        pred_seg = torch.unsqueeze(pred_seg_total[0, 0, :, :], 0).unsqueeze(0)

                        # Binarise the sigmoid probabilities at 0.5.
                        label_pred_prob = torch.sigmoid(pred_seg)
                        label_pred_mask = torch.zeros_like(label_pred_prob, dtype=torch.float32)
                        label_pred_mask[label_pred_prob > 0.5] = 1
                        annotation_pred = label_pred_mask[0][0].cpu()
                        mask = np.expand_dims(annotation_pred.numpy(), 0)

                        done_image_mask = plot_mask(img, mask, alpha=0.9, indexlist=[0])
                        # Only the first batch is processed. Return the uint8
                        # image (the same data written to disk) rather than
                        # the float array scaled to [0, 255].
                        return img, done_image_mask
+# def make_transparent_foreground(pic, mask):
+#     # split the image into channels
+#     b, g, r = cv2.split(np.array(pic).astype('uint8'))
+#     # add an alpha channel with and fill all with transparent pixels (max 255)
+#     a = np.ones(mask.shape, dtype='uint8') * 255
+#     # merge the alpha channel back
+#     alpha_im = cv2.merge([b, g, r, a], 4)
+#     # create a transparent background
+#     bg = np.zeros(alpha_im.shape)
+#     # setup the new mask
+#     new_mask = np.stack([mask, mask, mask, mask], axis=2)
+#     # copy only the foreground color pixels from the original image where mask is set
+#     foreground = np.where(new_mask, alpha_im, bg).astype(np.uint8)
+#     return foreground
+# def remove_background(input_image):
+#     preprocess = transforms.Compose([
+#         transforms.ToTensor(),
+#         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
+#     ])
+#     input_tensor = preprocess(input_image)
+#     input_batch = input_tensor.unsqueeze(0)  # create a mini-batch as expected by the model
+#     # move the input and model to GPU for speed if available
+#     if torch.cuda.is_available():
+#         input_batch = input_batch.to('cuda')
+#         model.to('cuda')
+#     with torch.no_grad():
+#         output = model(input_batch)['out'][0]
+#     output_predictions = output.argmax(0)
+#     # create a binary (black and white) mask of the profile foreground
+#     mask = output_predictions.byte().cpu().numpy()
+#     background = np.zeros(mask.shape)
+#     bin_mask = np.where(mask, 255, background).astype(np.uint8)
+#     foreground = make_transparent_foreground(input_image, bin_mask)
+#     return foreground, bin_mask
+# def inference(img):
+#     foreground, _ = remove_background(img)
+#     return foreground
+# torch.hub.download_url_to_file('https://pbs.twimg.com/profile_images/691700243809718272/z7XZUARB_400x400.jpg',
+#                                'demis.jpg')
+# torch.hub.download_url_to_file('https://hai.stanford.edu/sites/default/files/styles/person_medium/public/2020-03/hai_1512feifei.png?itok=INFuLABp',
+#                                'lifeifei.png')
+# model = torch.hub.load('pytorch/vision:v0.6.0', 'deeplabv3_resnet101', pretrained=True)
+# model.eval()
 gr.Interface(
     inference,
+    gr.inputs.Textbox(label='Prompt', default='a photo of a lion on a mountain top at sunset'),
+    gr.inputs.Textbox(label='category', default='lion'),
     gr.outputs.Image(type="pil", label="Output"),
+    # title=title,
+    # description=description,
+    # article=article,
+    # examples=[['demis.jpg'], ['lifeifei.png']],
+    # enable_queue=True
 ).launch(debug=False)