Spaces:
Runtime error
Runtime error
Add stable diffusion for compositional generation.
Browse files
app.py
CHANGED
|
@@ -25,6 +25,10 @@ from composable_diffusion.model_creation import model_and_diffusion_defaults as
|
|
| 25 |
|
| 26 |
|
| 27 |
from PIL import Image
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
# This notebook supports both CPU and GPU.
|
| 29 |
# On CPU, generating one sample may take on the order of 20 minutes.
|
| 30 |
# On a GPU, it should be under a minute.
|
|
@@ -33,6 +37,12 @@ has_cuda = th.cuda.is_available()
|
|
| 33 |
device = th.device('cpu' if not has_cuda else 'cuda')
|
| 34 |
print(device)
|
| 35 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
# Create base model.
|
| 37 |
timestep_respacing = 100 # @param{type: 'number'}
|
| 38 |
options = model_and_diffusion_defaults()
|
|
@@ -276,9 +286,17 @@ def compose_clevr_objects(prompt, guidance_scale):
|
|
| 276 |
return out_img
|
| 277 |
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
def compose(prompt, version, guidance_scale):
|
| 280 |
if version == 'GLIDE':
|
| 281 |
return compose_language_descriptions(prompt, guidance_scale)
|
|
|
|
|
|
|
| 282 |
else:
|
| 283 |
return compose_clevr_objects(prompt, guidance_scale)
|
| 284 |
|
|
@@ -286,14 +304,15 @@ def compose(prompt, version, guidance_scale):
|
|
| 286 |
examples_1 = 'a camel | a forest'
|
| 287 |
examples_2 = 'A cloudy blue sky | A mountain in the horizon | Cherry Blossoms in front of the mountain'
|
| 288 |
examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
|
| 289 |
-
|
|
|
|
| 290 |
|
| 291 |
import gradio as gr
|
| 292 |
|
| 293 |
title = 'Compositional Visual Generation with Composable Diffusion Models'
|
| 294 |
-
description = '<p>Demo for Composable Diffusion<ul><li>~30s per GLIDE example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time is
|
| 295 |
|
| 296 |
-
iface = gr.Interface(compose, inputs=["text", gr.Radio(['GLIDE', 'CLEVR Objects'], type="value", label='version'), gr.Slider(
|
| 297 |
title=title, description=description, examples=examples)
|
| 298 |
|
| 299 |
-
iface.launch()
|
|
|
|
from PIL import Image

from torch import autocast
from diffusers import StableDiffusionPipeline

# This notebook supports both CPU and GPU.
# On CPU, generating one sample may take on the order of 20 minutes.
# On a GPU, it should be under a minute.

device = th.device('cpu' if not has_cuda else 'cuda')
print(device)

# Initialize the Stable Diffusion pipeline (v1-4 checkpoint from the HF hub).
# `use_auth_token=True` is required because the weights are gated behind a
# logged-in Hugging Face account.
pipe = StableDiffusionPipeline.from_pretrained(
    "CompVis/stable-diffusion-v1-4",
    use_auth_token=True
).to(device)

# Create base model.
timestep_respacing = 100  # @param{type: 'number'}
options = model_and_diffusion_defaults()
|
|
|
|
| 286 |
return out_img
|
| 287 |
|
| 288 |
|
| 289 |
+
def stable_diffusion_compose(prompt, scale):
    """Generate one image for `prompt` with the global Stable Diffusion pipeline.

    Runs the module-level `pipe` under torch autocast on whichever device is
    available and returns the first generated sample (presumably a PIL image —
    confirm against the installed diffusers version).

    :param prompt: text prompt passed straight through to the pipeline
    :param scale: classifier-free guidance scale forwarded as `guidance_scale`
    """
    amp_device = 'cpu' if not has_cuda else 'cuda'
    with autocast(amp_device):
        output = pipe(prompt, guidance_scale=scale)
    return output["sample"][0]
|
| 293 |
+
|
| 294 |
+
|
| 295 |
def compose(prompt, version, guidance_scale):
    """Route a generation request to the backend selected by `version`.

    'GLIDE' and 'Stable_Diffusion_1v_4' go to their respective text-to-image
    composers; any other value falls back to the CLEVR object composer.
    """
    handlers = {
        'GLIDE': compose_language_descriptions,
        'Stable_Diffusion_1v_4': stable_diffusion_compose,
    }
    handler = handlers.get(version, compose_clevr_objects)
    return handler(prompt, guidance_scale)
|
| 302 |
|
|
|
|
| 304 |
# Example prompts: sentences (or CLEVR object coordinates) are composed with
# the `|` delimiter.
examples_1 = 'a camel | a forest'
examples_2 = 'A cloudy blue sky | A mountain in the horizon | Cherry Blossoms in front of the mountain'
examples_3 = '0.1, 0.5 | 0.3, 0.5 | 0.5, 0.5 | 0.7, 0.5 | 0.9, 0.5'
examples_4 = 'a river leading into a mountain | red trees on the side'

# Each entry is [prompt, model version, guidance scale].
examples = [
    [examples_1, 'GLIDE', 10],
    [examples_4, 'Stable_Diffusion_1v_4', 10],
    [examples_2, 'GLIDE', 10],
    [examples_3, 'CLEVR Objects', 10],
]

import gradio as gr

title = 'Compositional Visual Generation with Composable Diffusion Models'
description = '<p>Demo for Composable Diffusion<ul><li>~30s per GLIDE/Stable-Diffusion example</li><li>~10s per CLEVR Object example</li>(<b>Note</b>: time is varied depending on what gpu is used.)</ul></p><p>See more information from our <a href="https://energy-based-model.github.io/Compositional-Visual-Generation-with-Composable-Diffusion-Models/">Project Page</a>.</p><ul><li>One version is based on the released <a href="https://github.com/openai/glide-text2im">GLIDE</a> and <a href="https://github.com/CompVis/stable-diffusion/">Stable Diffusion</a> for composing natural language description.</li><li>Another is based on our pre-trained CLEVR Object Model for composing objects. <br>(<b>Note</b>: We recommend using <b><i>x</i></b> in range <b><i>[0.1, 0.9]</i></b> and <b><i>y</i></b> in range <b><i>[0.25, 0.7]</i></b>, since the training dataset labels are in given ranges.)</li></ul><p>When composing multiple sentences, use `|` as the delimiter, see given examples below.</p>'

# Demo UI: free-text prompt, a version selector, and a guidance-scale slider.
version_selector = gr.Radio(
    ['Stable_Diffusion_1v_4', 'GLIDE', 'CLEVR Objects'],
    type="value",
    label='version',
)
iface = gr.Interface(
    compose,
    inputs=["text", version_selector, gr.Slider(2, 20)],
    outputs='image',
    title=title,
    description=description,
    examples=examples,
)

iface.launch()
|