File size: 9,828 Bytes
95ec8d7
 
19a9112
95ec8d7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19a9112
 
95ec8d7
 
 
 
 
 
 
 
 
 
 
 
 
19a9112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1d8b8c4
c39c80d
 
19a9112
 
 
 
 
 
 
 
9e14c4a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19a9112
95ec8d7
 
19a9112
95ec8d7
19a9112
95ec8d7
19a9112
 
0e173d8
19a9112
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
import os
import numpy as np
import cv2
import torch
from torch import nn
from torch.nn import functional as F
from torch.utils import data
from torchvision import transforms, utils
from tqdm import tqdm
torch.backends.cudnn.benchmark = True
import copy
from util import *
from PIL import Image

from model import *
import moviepy.video.io.ImageSequenceClip
import scipy
import kornia.augmentation as K

from base64 import b64encode
import gradio as gr
from torchvision import transforms

# torch.hub.download_url_to_file('https://i.imgur.com/HiOTPNg.png', 'mona.png')
# torch.hub.download_url_to_file('https://i.imgur.com/Cw8HcTN.png', 'painting.png')

# ---------------------------------------------------------------------------
# Model / preprocessing setup, executed once at import time.
# ---------------------------------------------------------------------------

# Device that the generator and the input/latent tensors below are moved to.
device = 'cpu'
# Width of the random style vectors sampled below; also passed to Generator.
latent_dim = 8
# Passed to Generator as its fourth positional argument.
n_mlp = 5
# NOTE(review): not referenced anywhere in the visible part of this file.
num_down = 3

# Generator used by inference(); moved to `device` and switched to eval mode.
# `Generator` is not defined in this file (presumably from `model` via the star import).
G_A2B = Generator(256, 4, latent_dim, n_mlp, channel_multiplier=1, lr_mlp=.01,n_res=1).to(device).eval()

# `ensure_checkpoint_exists` is defined elsewhere (presumably in `util`) — presumably
# it makes sure the checkpoint file is present locally; verify against its definition.
ensure_checkpoint_exists('GNR_checkpoint_full.pt')
# NOTE(review): torch.load unpickles the file, so only trusted checkpoints should be used.
ckpt = torch.load('GNR_checkpoint_full.pt', map_location=device)

# Load the weights stored under the 'G_A2B_ema' key of the checkpoint.
G_A2B.load_state_dict(ckpt['G_A2B_ema'])

# mean latent
# Average of the mapping-network output over 1000 random latents (dim 0 kept).
# NOTE(review): neither `truncation` nor `mean_style` is used in the visible code.
truncation = 1
with torch.no_grad():
    mean_style = G_A2B.mapping(torch.randn([1000, latent_dim]).to(device)).mean(0, keepdim=True)


# Preprocessing applied to every input image in inference(): resize to 256x256,
# convert to a tensor, then normalize each channel as (x - 0.5) / 0.5 (in place).
test_transform = transforms.Compose([
    transforms.Resize((256, 256)),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5), inplace=True)
])
# NOTE(review): `plt` is not imported by name in this file; presumably it is
# provided by one of the star imports (`util` / `model`) — confirm.
plt.rcParams['figure.dpi'] = 200

# torch.manual_seed(84986)

# Number of style vectors per input, and one fixed random style batch.
# NOTE(review): `style` is only referenced by a commented-out line in inference(),
# which samples a fresh style on every call instead.
num_styles = 1
style = torch.randn([num_styles, latent_dim]).to(device)


def inference(input_im):
    """Translate a face photo into an anime-style image using a freshly sampled style.

    Args:
        input_im: PIL image to translate, or ``None`` when no image was supplied.

    Returns:
        A PIL RGB image decoded by ``G_A2B``, or ``None`` if ``input_im`` is ``None``.
        Only the first of the ``num_styles`` decoded images is returned.
    """
    # Identity check; `== None` would dispatch to the image object's own __eq__.
    if input_im is None:
        return None

    # Normalize() in test_transform expects exactly three channels, so coerce
    # RGBA / grayscale / palette images to RGB before preprocessing.
    real_A = test_transform(input_im.convert('RGB')).unsqueeze(0).to(device)

    with torch.no_grad():
        A2B_content, _ = G_A2B.encode(real_A)
        # A new random style per call, so repeated runs on the same input differ.
        style_code = torch.randn([num_styles, latent_dim]).to(device)
        fake_A2B = G_A2B.decode(A2B_content.repeat(num_styles, 1, 1, 1), style_code)

        # Invert Normalize(mean=0.5, std=0.5): x * std + mean. Scalars are used so
        # the arithmetic works on whatever device the generator output lives on.
        # Clamp to [0, 1] because ToPILImage scales floats by 255 and casts to
        # uint8, which wraps around for out-of-range values.
        z = (fake_A2B * 0.5 + 0.5).clamp(0, 1)

    # Index the batch explicitly instead of squeeze(), which only yields a 3-D
    # tensor when the batch size happens to be 1.
    return transforms.ToPILImage(mode='RGB')(z[0].cpu())

def clear(image):
    """Ignore ``image`` and return ``None``.

    Written as a UI callback: returning ``None`` leaves the target component
    empty. Only the commented-out Blocks layout in this file wires it up.
    """
    return None

def setsample(image):
    """Return ``image`` unchanged (identity callback for copying a sample into an input)."""
    selected = image
    return selected


# with gr.Blocks() as demo:
#     gr.Markdown("<h1>GANs N' Roses</h1>")
#     gr.Markdown("""Convert real-life face images into diverse anime versions of themselves. Use the default sample image or replace the input
#                 by first clicking X then dragging a new image into the Input box. Crop the image by clicking the pen tool. Click <b>Run</b> to transform the input
#                 into an anime version. Click <b>Clear</b> to clear the output box. Try running it multiple times for different anime styles!""")
    
#     with gr.Row():
#         with gr.Column():
#             inp = gr.Image(type="pil", value ="", label="Input")
#             with gr.Row():
#                 clr = gr.Button("Clear") #needs implementation
#                 run = gr.Button("Run")
#         with gr.Column():
#             out = gr.outputs.Image(type="pil")
#         clr.click(fn=clear, inputs=inp, outputs=inp) # clear input gr.Image
#         clr.click(fn=clear, inputs=out, outputs=out) # clear output gr.Image

          
#     gr.Markdown("<h3>Sample Inputs</h3>")
   
#     # with gr.Row():
#             # with gr.Column():
#                 # sample1 = gr.Image(value="sample_images/1.JPG")
#             # with gr.Column():
#                 # samplebtn1 = gr.Button(value="Try sample 1")
#                 # samplebtn1.click(fn=setsample, inputs=sample1, outputs=inp)

#             # with gr.Column():
#                 # sample2 = gr.Image(value="sample_images/2.JPG")
#             # with gr.Column():
#                 # samplebtn2 = gr.Button(value="Try sample 2")
#                 # samplebtn2.click(fn=setsample, inputs=sample2, outputs=inp)    

#             # with gr.Column():
#                 # sample3 = gr.Image(value="sample_images/3.JPG")
#             # with gr.Column():
#                 # samplebtn3 = gr.Button(value="Try sample 3")
#                 # samplebtn3.click(fn=setsample, inputs=sample3, outputs=inp)    
        
#     #add info here
#     gr.Markdown("""
#                 GANs N' Roses (GNR) is an image-to-image framework for face images that uses a multimodal approach with novel definitions for content and style.
#                 <b>Content</b> is defined as what changes when a augmentations are applied to a face image. <b>Style</b> is defined as what does not change when augmentations
#                 are applied to a face image.

#                 GNR's implementation borrows heavily from StyleGAN2; however, adversarial loss is derived from the introduced content and style definitions, ensuring diversity of
#                 outputs when repeatedly transforming the same input face image.

#                 The current implementation was trained on the selfie2anime dataset and transforms real human faces into anime faces. Due to limitations of the dataset, GNR works best
#                 when working with <b>female face inputs</b> that are <b>cropped to include only the face</b> (no neck and body).

#                 GNR was implemented by Chong, M. & Forsyth, D. (2021) in the paper GANs N' Roses: Stable, Controllable, Diverse Image to Image Translation (works for videos too!)
#                 """)

    
#     run.click(fn=inference, inputs=inp, outputs=out)
# Heading shown at the top of the Gradio interface.
title = "GANs N' Roses"

# Short usage instructions rendered under the title.
description = """Convert real-life face images into diverse anime versions of themselves. Use the default sample image or replace the input
                by first clicking X then dragging a new image into the Input box. Crop the image by clicking the pen tool. Click <b>Submit</b> to transform the input
                into an anime version. Click <b>Clear</b> to clear the output box. Try running it multiple times for different anime styles!"""

# Long-form HTML explanation rendered below the interface.
article = """<h1>
  What is GANs N' Roses?
</h1>
<p>GANs N' Roses (GNR) is an image-to-image framework for face images that uses a multimodal approach with novel definitions for content and style.
                 <b>Content</b> is defined as what changes when augmentations are applied to a face image. <b>Style</b> is defined as what does not change when augmentations
                 are applied to a face image. The backbone learns these two things separately and uses that information to generate images.</p> 
<h1>
  How does it work?
</h1>
<p>
  GNR creates images through the use of what's called a Generative Adversarial Network (GAN). To explain what a GAN is, imagine a situation where Tom is learning to draw an apple. Tom knows nothing about an apple so he scribbles a random shape and calls it an apple. He asks his friend Jerry if he got it correctly and naturally Jerry said no. Tom reflects on his drawing and scribbles a new "apple", showing it to Jerry each time. Eventually, Tom gets lucky and draws something close to an apple and fools Jerry. Tom picks up on what features that drawing has, creating more drawings similar to it. He eventually gets better and better but Jerry doesn't like getting fooled so he learns how to tell apart Tom's fake apples better. At this point, it becomes a cat-and-mouse game where both keep learning new things in order to outwit each other. This is the general idea behind GAN's. In more formal terms, GAN's function using 2 neural networks: the <i>generator</i> and the <i>discriminator</i>. The former would be Tom and the latter would be Jerry.
</p>
<p>
   GNR's implementation borrows heavily from an existing system called <i>StyleGAN2</i>. The main difference is that adversarial loss is derived from the introduced content and style definitions, ensuring diversity of outputs when repeatedly transforming the same input face image. 
</p>
<p>The current implementation was trained on the <i>selfie2anime</i> dataset and transforms real human faces into anime faces. Due to limitations of the dataset, GNR works best when working with <b>female face inputs</b> that are <b>cropped to include only the face</b> (no neck and body).</p>
                 <p>GNR was implemented by Chong, M. & Forsyth, D. (2021) in the paper GANs N' Roses: Stable, Controllable, Diverse Image to Image Translation (works for videos too!)</p>"""
# Build the demo UI and start serving it. `gr.Image` replaces the deprecated
# `gr.inputs.Image` / `gr.outputs.Image` namespaces, which newer Gradio
# releases no longer provide.
gr.Interface(
    inference,
    [gr.Image(type="pil", label="Input")],
    gr.Image(type="pil", label="Output"),
    title=title,
    description=description,
    article=article,
    allow_flagging='never',
    # Each inner list is one example row holding the path for the single input.
    examples=[["sample_images/2.jpg"], ["sample_images/1.JPG"], ["sample_images/3.jpg"]],
).launch(share=True)
# demo.launch(share = True)