Update app.py
Browse files
app.py
CHANGED
|
@@ -53,7 +53,6 @@ torch.manual_seed(1)
|
|
| 53 |
logging.set_verbosity_error()
|
| 54 |
|
| 55 |
# Set device
|
| 56 |
-
torch_device = "cpu"
|
| 57 |
#if "mps" == torch_device: os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = "1"
|
| 58 |
|
| 59 |
"""## Loading the models
|
|
@@ -91,7 +90,7 @@ If all you want is to make a picture with some text, you could ignore this noteb
|
|
| 91 |
|
| 92 |
What we want to do in this notebook is dig a little deeper into how this works, so we'll start by checking that the example code runs. Again, this is adapted from the [HF notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) and looks very similar to what you'll find if you inspect [the `__call__()` method of the stable diffusion pipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L200).
|
| 93 |
"""
|
| 94 |
-
|
| 95 |
# Some settings
|
| 96 |
prompt = ["A watercolor painting of an otter"]
|
| 97 |
height = 512 # default height of Stable Diffusion
|
|
@@ -400,7 +399,7 @@ The token is fed to the `token_embedding` to transform it into a vector. The fun
|
|
| 400 |
|
| 401 |
We can look at the embedding layer:
|
| 402 |
"""
|
| 403 |
-
|
| 404 |
# Access the embedding layer
|
| 405 |
token_emb_layer = text_encoder.text_model.embeddings.token_embedding
|
| 406 |
token_emb_layer # Vocab size 49408, emb_dim 768
|
|
@@ -881,7 +880,7 @@ num_inference_steps = 50 #@param # Number of denoising steps
|
|
| 881 |
guidance_scale = 8 #@param # Scale for classifier-free guidance
|
| 882 |
generator = torch.manual_seed(0) # Seed generator to create the initial latent noise
|
| 883 |
batch_size = 1
|
| 884 |
-
|
| 885 |
|
| 886 |
# Prep text
|
| 887 |
text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
|
|
@@ -936,7 +935,7 @@ for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps))
|
|
| 936 |
denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5 # range (0, 1)
|
| 937 |
|
| 938 |
# Calculate loss
|
| 939 |
-
loss = orange_loss(denoised_images) *
|
| 940 |
|
| 941 |
# Occasionally print it out
|
| 942 |
if i%10==0:
|
|
@@ -963,7 +962,7 @@ num_inference_steps = 50 #@param # Number of denoising steps
|
|
| 963 |
guidance_scale = 8 #@param # Scale for classifier-free guidance
|
| 964 |
generator = torch.manual_seed(77) # Seed generator to create the initial latent noise
|
| 965 |
batch_size = 1
|
| 966 |
-
|
| 967 |
|
| 968 |
# Prep text
|
| 969 |
text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
|
|
@@ -1018,7 +1017,7 @@ for i, t in tqdm(enumerate(scheduler.timesteps), total=len(scheduler.timesteps))
|
|
| 1018 |
denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5 # range (0, 1)
|
| 1019 |
|
| 1020 |
# Calculate loss
|
| 1021 |
-
loss = orange_loss(denoised_images) *
|
| 1022 |
|
| 1023 |
# Occasionally print it out
|
| 1024 |
if i%10==0:
|
|
@@ -1045,7 +1044,7 @@ num_inference_steps = 50 #@param # Number of denoising steps
|
|
| 1045 |
guidance_scale = 8 #@param # Scale for classifier-free guidance
|
| 1046 |
generator = torch.manual_seed(42) # Seed generator to create the initial latent noise
|
| 1047 |
batch_size = 1
|
| 1048 |
-
|
| 1049 |
|
| 1050 |
# Prep text
|
| 1051 |
text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
|
|
@@ -1135,7 +1134,28 @@ This notebook was written by Jonathan Whitaker, adapted from ['Grokking Stable D
|
|
| 1135 |
|
| 1136 |
import gradio as gr
|
| 1137 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1138 |
|
|
|
|
| 1139 |
dict_styles = {'<gartic-phone>':'styles/learned_embeds_gartic-phone.bin',
|
| 1140 |
'<hawaiian shirt>':'styles/learned_embeds_hawaiian-shirt.bin',
|
| 1141 |
'<gp>': 'styles/learned_embeds_phone01.bin',
|
|
@@ -1147,7 +1167,7 @@ def inference(prompt, style):
|
|
| 1147 |
|
| 1148 |
if prompt is not None and style is not None:
|
| 1149 |
style = dict_styles[style]
|
| 1150 |
-
result =
|
| 1151 |
return np.array(result)
|
| 1152 |
else:
|
| 1153 |
return None
|
|
@@ -1168,5 +1188,6 @@ demo = gr.Interface(inference,
|
|
| 1168 |
# examples = examples,
|
| 1169 |
# cache_examples=True
|
| 1170 |
)
|
|
|
|
| 1171 |
demo.launch()
|
| 1172 |
|
|
|
|
| 53 |
logging.set_verbosity_error()
|
| 54 |
|
| 55 |
# Set device
|
|
|
|
| 56 |
#if "mps" == torch_device: os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = "1"
|
| 57 |
|
| 58 |
"""## Loading the models
|
|
|
|
| 90 |
|
| 91 |
What we want to do in this notebook is dig a little deeper into how this works, so we'll start by checking that the example code runs. Again, this is adapted from the [HF notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/diffusers/stable_diffusion.ipynb) and looks very similar to what you'll find if you inspect [the `__call__()` method of the stable diffusion pipeline](https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py#L200).
|
| 92 |
"""
|
| 93 |
+
##########################################################################################
|
| 94 |
# Some settings
|
| 95 |
prompt = ["A watercolor painting of an otter"]
|
| 96 |
height = 512 # default height of Stable Diffusion
|
|
|
|
| 399 |
|
| 400 |
We can look at the embedding layer:
|
| 401 |
"""
|
| 402 |
+
#########################################################################################
|
| 403 |
# Access the embedding layer
|
| 404 |
token_emb_layer = text_encoder.text_model.embeddings.token_embedding
|
| 405 |
token_emb_layer # Vocab size 49408, emb_dim 768
|
|
|
|
| 880 |
guidance_scale = 8 #@param # Scale for classifier-free guidance
|
| 881 |
generator = torch.manual_seed(0) # Seed generator to create the initial latent noise
|
| 882 |
batch_size = 1
|
| 883 |
+
blue_loss_scale = 200 #@param
|
| 884 |
|
| 885 |
# Prep text
|
| 886 |
text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
|
|
|
|
| 935 |
denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5 # range (0, 1)
|
| 936 |
|
| 937 |
# Calculate loss
|
| 938 |
+
loss = orange_loss(denoised_images) * blue_loss_scale
|
| 939 |
|
| 940 |
# Occasionally print it out
|
| 941 |
if i%10==0:
|
|
|
|
| 962 |
guidance_scale = 8 #@param # Scale for classifier-free guidance
|
| 963 |
generator = torch.manual_seed(77) # Seed generator to create the initial latent noise
|
| 964 |
batch_size = 1
|
| 965 |
+
blue_loss_scale = 200 #@param
|
| 966 |
|
| 967 |
# Prep text
|
| 968 |
text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
|
|
|
|
| 1017 |
denoised_images = vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5 # range (0, 1)
|
| 1018 |
|
| 1019 |
# Calculate loss
|
| 1020 |
+
loss = orange_loss(denoised_images) * blue_loss_scale
|
| 1021 |
|
| 1022 |
# Occasionally print it out
|
| 1023 |
if i%10==0:
|
|
|
|
| 1044 |
guidance_scale = 8 #@param # Scale for classifier-free guidance
|
| 1045 |
generator = torch.manual_seed(42) # Seed generator to create the initial latent noise
|
| 1046 |
batch_size = 1
|
| 1047 |
+
blue_loss_scale = 200 #@param
|
| 1048 |
|
| 1049 |
# Prep text
|
| 1050 |
text_input = tokenizer([prompt], padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
|
|
|
|
| 1134 |
|
| 1135 |
import gradio as gr
|
| 1136 |
|
| 1137 |
+
def generate_image_from_prompt(text_in, style_in):
    """Generate a styled image and its loss-guided variant for a text prompt.

    Args:
        text_in: The text prompt forwarded to ``embed_style`` and
            ``loss_style``.
        style_in: Short style name (``"oil"``, ``"valorant"``, ``"cartoon"``,
            ``"space"`` or ``"terraria"``). It is matched against the
            ``"<name>_"`` prefix of the entries in ``STYLE_LIST``.

    Returns:
        A two-element list ``[generated_image, loss_generated_img]`` holding
        the results of ``embed_style`` and ``loss_style`` respectively.

    Raises:
        ValueError: If ``style_in`` does not correspond to any known style.
    """
    # File names are used exactly as listed. Two of them are spelled
    # "_syle"; building the name as ``style_in + '_style.bin'`` never matched
    # those entries, so "cartoon" and "terraria" always raised ValueError.
    # NOTE(review): assumes the files on disk carry these exact names --
    # TODO confirm before renaming anything here.
    STYLE_LIST = ['oil_style.bin', 'valorant_style.bin', 'cartoon_syle.bin', 'space_style.bin', 'terraria_syle.bin']
    # Seed paired by position with each entry of STYLE_LIST.
    STYLE_SEEDS = [128, 64, 128, 64, 128]

    print(text_in)
    print(style_in)

    # Resolve the style by its "<name>_" prefix instead of rebuilding the
    # file name, so the lookup is independent of the suffix spelling.
    # NOTE(review): the visible caller passes a value taken from
    # ``dict_styles`` (a file path), which would not match any prefix here --
    # verify what the caller is supposed to pass.
    style_prefix = f"{style_in}_"
    for idx, candidate in enumerate(STYLE_LIST):
        if candidate.startswith(style_prefix):
            style_file = candidate
            break
    else:
        known_styles = [name.split('_')[0] for name in STYLE_LIST]
        raise ValueError(
            f"Unknown style {style_in!r}; expected one of {known_styles}"
        )

    print(style_file)
    print(idx)

    style_seed = STYLE_SEEDS[idx]

    # NOTE(review): torch.load unpickles the file; only load style files
    # from a trusted source.
    style_dict = torch.load(style_file)
    # Only the first stored value is used as the style embedding.
    style_embed = list(style_dict.values())[0]

    generated_image = embed_style(text_in, style_embed, style_seed)
    loss_generated_img = loss_style(text_in, style_embed, style_seed)

    return [generated_image, loss_generated_img]
|
| 1159 |
dict_styles = {'<gartic-phone>':'styles/learned_embeds_gartic-phone.bin',
|
| 1160 |
'<hawaiian shirt>':'styles/learned_embeds_hawaiian-shirt.bin',
|
| 1161 |
'<gp>': 'styles/learned_embeds_phone01.bin',
|
|
|
|
| 1167 |
|
| 1168 |
if prompt is not None and style is not None:
|
| 1169 |
style = dict_styles[style]
|
| 1170 |
+
result = generate_image_from_prompt(prompt, style)
|
| 1171 |
return np.array(result)
|
| 1172 |
else:
|
| 1173 |
return None
|
|
|
|
| 1188 |
# examples = examples,
|
| 1189 |
# cache_examples=True
|
| 1190 |
)
|
| 1191 |
+
|
| 1192 |
demo.launch()
|
| 1193 |
|