import os
import numpy as np
import cv2
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torchvision import transforms, utils
from tqdm import tqdm
torch.backends.cudnn.benchmark = True
import copy
from util import *
from PIL import Image
from model import *
import moviepy.video.io.ImageSequenceClip
import scipy
import kornia.augmentation as K
from base64 import b64encode
import gradio as gr
from torchvision import transforms

# torch.hub.download_url_to_file('https://i.imgur.com/HiOTPNg.png', 'mona.png')
# torch.hub.download_url_to_file('https://i.imgur.com/Cw8HcTN.png', 'painting.png')

# Everything below runs on this device; the checkpoint is mapped onto it too.
device = 'cpu'
latent_dim = 8
n_mlp = 5
num_down = 3

# Generator used for the A -> B translation, put in eval mode for inference.
G_A2B = Generator(256, 4, latent_dim, n_mlp, channel_multiplier=1, lr_mlp=.01, n_res=1).to(device).eval()

ensure_checkpoint_exists('GNR_checkpoint_full.pt')
ckpt = torch.load('GNR_checkpoint_full.pt', map_location=device)
G_A2B.load_state_dict(ckpt['G_A2B_ema'])

# mean latent
truncation = 1
with torch.no_grad():
    mean_style = G_A2B.mapping(torch.randn([1000, latent_dim]).to(device)).mean(0, keepdim=True)

# Pre-processing applied to every input image: resize to 256x256, convert to a
# tensor and map each channel from [0, 1] to [-1, 1].
test_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), inplace=True)
])
# NOTE(review): `plt` is not imported explicitly in this file; presumably it is
# provided by one of the star imports above -- confirm.
plt.rcParams['figure.dpi'] = 200

# torch.manual_seed(84986)

num_styles = 1
style = torch.randn([num_styles, latent_dim]).to(device)


def inference(input_im):
    """Translate one input image with the generator, using a random style code.

    Args:
        input_im: PIL image supplied by the Gradio input component, or None
            when nothing was uploaded.

    Returns:
        A PIL RGB image produced by ``G_A2B``, or None if ``input_im`` is None.
        A fresh style code is sampled on every call, so repeated calls with
        the same input give different outputs.
    """
    if input_im is None:
        return None

    # The normalisation in `test_transform` is defined for three channels, so
    # force RGB in case an RGBA or greyscale image is passed in.
    real_A = test_transform(input_im.convert('RGB')).unsqueeze(0).to(device)

    with torch.no_grad():
        A2B_content, _ = G_A2B.encode(real_A)
        # fake_A2B = G_A2B.decode(A2B_content.repeat(num_styles,1,1,1), style)
        random_style = torch.randn([num_styles, latent_dim]).to(device)
        fake_A2B = G_A2B.decode(A2B_content.repeat(num_styles, 1, 1, 1), random_style)

        # Undo the Normalize(mean=0.5, std=0.5) applied by `test_transform`.
        # The constants are created on the same device/dtype as the generator
        # output so this keeps working if `device` is changed.
        std = (0.5, 0.5, 0.5)
        mean = (0.5, 0.5, 0.5)
        std_t = torch.tensor(std, device=fake_A2B.device, dtype=fake_A2B.dtype).view(3, 1, 1)
        mean_t = torch.tensor(mean, device=fake_A2B.device, dtype=fake_A2B.dtype).view(3, 1, 1)
        z = fake_A2B * std_t + mean_t

        # ToPILImage scales by 255 and casts to uint8; values outside [0, 1]
        # would wrap around and show up as speckle artefacts, so clamp first.
        z = z.clamp(0.0, 1.0)

    # Select the first (with num_styles == 1, the only) generated image and
    # move it to the CPU for the PIL conversion.
    return transforms.ToPILImage(mode='RGB')(z[0].cpu())


def clear(image):
    """Ignore ``image`` and return None."""
    return


def setsample(image):
    """Return ``image`` unchanged."""
    return image


# with gr.Blocks() as demo:
#     gr.Markdown("
GANs N' Roses (GNR) is an image-to-image framework for face images that uses a multi-modal approach with novel definitions for content and style. Content is defined as what changes when augmentations are applied to a face image. Style is defined as what does not change when augmentations are applied to a face image.
GNR's implementation borrows heavily from StyleGAN2; however, adversarial loss is derived from the introduced content and style definitions, ensuring diversity of outputs when repeatedly transforming the same input face image.
The current implementation was trained on the selfie2anime dataset and transforms real human faces into anime faces. Due to limitations of the dataset, GNR works best when working with female face inputs that are cropped to include only the face (no neck and body).
GNR was implemented by Chong, M. & Forsyth, D. (2021) in the paper GANs N' Roses: Stable, Controllable, Diverse Image to Image Translation (works for videos too!)
""" article = """GANs N' Roses (GNR) is an image-to-image framework for face images that uses a multimodal approach with novel definitions for content and style. Content is defined as what changes when a augmentations are applied to a face image. Style is defined as what does not change when augmentations are applied to a face image. The backbone learns these two things separately and uses that information to generate images.
GNR creates images through the use of what's called a Generative Adversarial Network (GAN). To explain what a GAN is, imagine a situation where Tom is learning to draw an apple. Tom knows nothing about an apple so he scribbles a random shape and calls it an apple. He asks his friend Jerry if he got it correctly and naturally Jerry says no. Tom reflects on his drawing and scribbles a new "apple", showing it to Jerry each time. Eventually, Tom gets lucky and draws something close to an apple and fools Jerry. Tom picks up on what features that drawing has, creating more drawings similar to it. He eventually gets better and better but Jerry doesn't like getting fooled so he learns how to tell apart Tom's fake apples better. At this point, it becomes a cat-and-mouse game where both keep learning new things in order to outwit each other. This is the general idea behind GANs. In more formal terms, GANs function using 2 neural networks: the generator and the discriminator. The former would be Tom and the latter would be Jerry.
GNR's implementation borrows heavily from an existing system called StyleGAN2. The main difference is that adversarial loss is derived from the introduced content and style definitions, ensuring diversity of outputs when repeatedly transforming the same input face image.
The current implementation was trained on the selfie2anime dataset and transforms real human faces into anime faces. Due to limitations of the dataset, GNR works best when working with female face inputs that are cropped to include only the face (no neck and body).
GNR was implemented by Chong, M. & Forsyth, D. (2021) in the paper GANs N' Roses: Stable, Controllable, Diverse Image to Image Translation (works for videos too!)
""" gr.Interface( inference, [gr.inputs.Image(type="pil", label="Input")], gr.outputs.Image(type="pil", label="Output"), title=title, description=description, article=article, allow_flagging='never', examples = [["sample_images/2.jpg"],["sample_images/1.JPG"],["sample_images/3.jpg"]] ).launch(share=True) # demo.launch(share = True)