Spaces:

RaviNaik
/

ERA-SESSION20

Runtime error

App Files Files Community

ravi.naik committed on Oct 17, 2023

Commit

b87b512

1 Parent(s): 19147b3

Added source, experiments, gradio app for stable diffusion

Browse files

Files changed (12) hide show

.gitattributes +1 -0
.gitignore +160 -0
app.py +91 -0
experiments/Stable Diffusion Deep Dive.ipynb +3 -0
experiments/exp.ipynb +3 -0
experiments/exp1.ipynb +3 -0
experiments/exp2.ipynb +3 -0
experiments/exp3.ipynb +3 -0
experiments/exp4.ipynb +3 -0
experiments/exp5.ipynb +3 -0
src/stable_diffusion.py +222 -0
src/utils.py +11 -0

.gitattributes CHANGED Viewed

@@ -1,4 +1,5 @@
 *.7z filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text

 *.7z filter=lfs diff=lfs merge=lfs -text
+*.ipynb filter=lfs diff=lfs merge=lfs -text
 *.arrow filter=lfs diff=lfs merge=lfs -text
 *.bin filter=lfs diff=lfs merge=lfs -text
 *.bz2 filter=lfs diff=lfs merge=lfs -text

.gitignore ADDED Viewed

	@@ -0,0 +1,160 @@

+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+# C extensions
+*.so
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+# Translations
+*.mo
+*.pot
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+# Flask stuff:
+instance/
+.webassets-cache
+# Scrapy stuff:
+.scrapy
+# Sphinx documentation
+docs/_build/
+# PyBuilder
+.pybuilder/
+target/
+# Jupyter Notebook
+.ipynb_checkpoints
+# IPython
+profile_default/
+ipython_config.py
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm.fming.dev/#use-with-ide
+.pdm.toml
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+# SageMath parsed files
+*.sage.py
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+# Spyder project settings
+.spyderproject
+.spyproject
+# Rope project settings
+.ropeproject
+# mkdocs documentation
+/site
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+# Pyre type checker
+.pyre/
+# pytype static type analyzer
+.pytype/
+# Cython debug symbols
+cython_debug/
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/

app.py ADDED Viewed

	@@ -0,0 +1,91 @@

+import gradio as gr
+import random
+import torch
+import pathlib
+from src.utils import concept_styles, loss_fn
+from src.stable_diffusion import StableDiffusion
# Root of the project, relative to the directory the app is started from.
PROJECT_PATH = "."
# Directory that holds the learned concept-embedding files.
CONCEPT_LIBS_PATH = PROJECT_PATH + "/concept_libs"
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"
def generate(prompt, styles, gen_steps, loss_scale):
    """Generate one unguided and one loss-guided image per selected style.

    Args:
        prompt: Text prompt typed by the user.
        styles: Selected keys of ``concept_styles`` (a list; a single string
            is accepted and treated as a one-element list).
        gen_steps: Number of denoising steps for the scheduler.
        loss_scale: Multiplier applied to the guidance loss for the guided
            image; the unguided image always uses a scale of 0.

    Returns:
        A dict mapping the two gallery components to lists of
        ``(image, style)`` tuples, in the order the styles were given.
    """
    # Iterating a bare string would yield single characters and fail the
    # concept_styles lookup, so normalise it to a list first.
    if isinstance(styles, str):
        styles = [styles]
    styles = styles or []

    lossless_images, lossy_images = [], []
    if not styles:
        return {lossless_gallery: lossless_images, lossy_gallery: lossy_images}

    # Load the model weights once and reuse the pipeline for every style;
    # only the seed changes between styles.
    diffusion = StableDiffusion(
        device=DEVICE,
        num_inference_steps=int(gen_steps),
    )

    for style in styles:
        concept_lib = pathlib.Path(CONCEPT_LIBS_PATH) / concept_styles[style]
        # map_location lets embeddings that were saved from a GPU load on a
        # CPU-only host as well.
        concept_embed = torch.load(concept_lib, map_location=DEVICE)

        # Fresh seed per style; both images of a style share it so the
        # guided and unguided results start from the same noise.
        diffusion.manual_seed = random.randint(0, 100)

        generated_image_lossless = diffusion.generate_image(
            prompt=prompt,
            loss_fn=loss_fn,
            loss_scale=0,
            concept_embed=concept_embed,
        )
        generated_image_lossy = diffusion.generate_image(
            prompt=prompt,
            loss_fn=loss_fn,
            loss_scale=loss_scale,
            concept_embed=concept_embed,
        )

        lossless_images.append((generated_image_lossless, style))
        lossy_images.append((generated_image_lossy, style))

    return {lossless_gallery: lossless_images, lossy_gallery: lossy_images}
# Gradio UI: prompt, style, step-count and guidance-scale controls on the left,
# the unguided and guided galleries on the right.
with gr.Blocks() as app:
    gr.Markdown("## ERA Session20 - Stable Diffusion: Generative Art with Guidance")
    with gr.Row():
        with gr.Column():
            prompt_box = gr.Textbox(label="Prompt", interactive=True)
            _style_names = list(concept_styles)
            style_selector = gr.Dropdown(
                choices=_style_names,
                # A multiselect dropdown takes a list of selected values,
                # so pre-select the first style as a one-element list.
                value=_style_names[:1],
                multiselect=True,
                label="Select a Concept Style",
                interactive=True,
            )
            gen_steps = gr.Slider(
                minimum=10,
                maximum=50,
                value=30,
                step=10,
                label="Select Number of Steps",
                interactive=True,
            )
            loss_scale = gr.Slider(
                minimum=0,
                maximum=32,
                value=8,
                step=8,
                label="Select Guidance Scale",
                interactive=True,
            )
            submit_btn = gr.Button(value="Generate")

        with gr.Column():
            lossless_gallery = gr.Gallery(
                label="Generated Images without Guidance", show_label=True
            )
            lossy_gallery = gr.Gallery(
                label="Generated Images with Guidance", show_label=True
            )

        # generate() returns a dict keyed by these two gallery components.
        submit_btn.click(
            generate,
            inputs=[prompt_box, style_selector, gen_steps, loss_scale],
            outputs=[lossless_gallery, lossy_gallery],
        )

app.launch()

experiments/Stable Diffusion Deep Dive.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9ff21d4579bafcd26c5ec593bd9020c65b85e552a1a8645dc60cf3eeddec3126
+size 8313731

experiments/exp.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e55d79ab1ba786bbcce564b743caf8064c69aa24dddd46e851a974329348e312
+size 2470336

experiments/exp1.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f53f8647069e798a316493db4f0e09ef0b798e2c74636f4465cc35236d9e5130
+size 3992987

experiments/exp2.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f8f946cb2f730f609ad3bb2b38b55ca142fc5c98916e6415586ab23f71aedbd8
+size 4713617

experiments/exp3.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:0cfe21389c210759611b9c14e02cdee663bee21276f0f7dbdc326d35899a9dd3
+size 1108233

experiments/exp4.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:71245f3f38295fec8dd0170face4539b6c176f0385a4c26868e84249a12e1ffd
+size 18169187

experiments/exp5.ipynb ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d42fb890c72a04192dcedbd6b583e8c1666f5217f0b8bd69f4a1834eeab5a45c
+size 49514010

src/stable_diffusion.py ADDED Viewed

	@@ -0,0 +1,222 @@

+import torch
+from diffusers import AutoencoderKL, LMSDiscreteScheduler, UNet2DConditionModel
+from transformers import CLIPTextModel, CLIPTokenizer
+from PIL import Image
+from tqdm import tqdm
class StableDiffusion:
    """Stable Diffusion text-to-image sampler with optional loss guidance.

    Loads a VAE, a CLIP tokenizer and text encoder, and a conditional UNet,
    and samples with an LMS discrete scheduler. A learned concept embedding
    can be spliced into the prompt, and a differentiable loss on the decoded
    image can nudge the latents during sampling.
    """

    def __init__(
        self,
        vae_arch="CompVis/stable-diffusion-v1-4",
        tokenizer_arch="openai/clip-vit-large-patch14",
        encoder_arch="openai/clip-vit-large-patch14",
        unet_arch="CompVis/stable-diffusion-v1-4",
        device="cpu",
        height=512,
        width=512,
        num_inference_steps=30,
        guidance_scale=7.5,
        manual_seed=1,
    ) -> None:
        """Load the pretrained components and move them to ``device``.

        Args:
            vae_arch: Model id whose ``vae`` subfolder is loaded.
            tokenizer_arch: Model id of the CLIP tokenizer.
            encoder_arch: Model id of the CLIP text encoder.
            unet_arch: Model id whose ``unet`` subfolder is loaded.
            device: Torch device the models and latents are placed on.
            height: Output image height in pixels.
            width: Output image width in pixels.
            num_inference_steps: Number of denoising steps.
            guidance_scale: Scale for classifier-free guidance.
            manual_seed: Seed for the initial latent noise.
        """
        self.height = height  # default height of Stable Diffusion
        self.width = width  # default width of Stable Diffusion
        self.num_inference_steps = num_inference_steps  # Number of denoising steps
        self.guidance_scale = guidance_scale  # Scale for classifier-free guidance
        self.device = device
        self.manual_seed = manual_seed

        vae = AutoencoderKL.from_pretrained(vae_arch, subfolder="vae")
        # Load the tokenizer and text encoder to tokenize and encode the text.
        self.tokenizer = CLIPTokenizer.from_pretrained(tokenizer_arch)
        text_encoder = CLIPTextModel.from_pretrained(encoder_arch)
        # The UNet model for generating the latents.
        unet = UNet2DConditionModel.from_pretrained(unet_arch, subfolder="unet")
        # The noise scheduler
        self.scheduler = LMSDiscreteScheduler(
            beta_start=0.00085,
            beta_end=0.012,
            beta_schedule="scaled_linear",
            num_train_timesteps=1000,
        )

        # Move the models to the target device.
        self.vae = vae.to(self.device)
        self.text_encoder = text_encoder.to(self.device)
        self.unet = unet.to(self.device)

        # Keep the token-embedding layer and precompute the position
        # embeddings for the first 77 positions so generate_image can build
        # input embeddings by hand.
        self.token_emb_layer = text_encoder.text_model.embeddings.token_embedding
        pos_emb_layer = text_encoder.text_model.embeddings.position_embedding
        position_ids = text_encoder.text_model.embeddings.position_ids[:, :77]
        self.position_embeddings = pos_emb_layer(position_ids)

    def get_output_embeds(self, input_embeddings):
        """Run pre-built input embeddings through the CLIP text encoder.

        Args:
            input_embeddings: Token + position embeddings; the first two
                dimensions are read as (batch, sequence length).

        Returns:
            The encoder's last hidden state after the final layer norm.
        """
        # CLIP's text model uses a causal mask, so we prepare it here
        # (via a private helper of the text model).
        bsz, seq_len = input_embeddings.shape[:2]
        causal_attention_mask = (
            self.text_encoder.text_model._build_causal_attention_mask(
                bsz, seq_len, dtype=input_embeddings.dtype
            )
        )

        # Call the encoder directly so we get the hidden states rather than
        # the pooled final prediction.
        encoder_outputs = self.text_encoder.text_model.encoder(
            inputs_embeds=input_embeddings,
            attention_mask=None,  # We aren't using an attention mask so that can be None
            causal_attention_mask=causal_attention_mask.to(self.device),
            output_attentions=None,
            output_hidden_states=True,  # We want the output embs not the final output
            return_dict=None,
        )

        # We're interested in the output hidden state only
        output = encoder_outputs[0]

        # There is a final layer norm we need to pass these through
        output = self.text_encoder.text_model.final_layer_norm(output)

        return output

    def set_timesteps(self, scheduler, num_inference_steps):
        """Configure ``scheduler`` for ``num_inference_steps`` steps and cast
        its timesteps to float32."""
        scheduler.set_timesteps(num_inference_steps)
        scheduler.timesteps = scheduler.timesteps.to(torch.float32)

    def latents_to_pil(self, latents):
        """Decode a batch of latents into a list of PIL images."""
        # batch of latents -> list of images
        # Undo the 0.18215 latent scaling before decoding with the VAE.
        latents = (1 / 0.18215) * latents
        with torch.no_grad():
            image = self.vae.decode(latents).sample
        # Map from [-1, 1] to [0, 1], then to channel-last uint8 arrays.
        image = (image / 2 + 0.5).clamp(0, 1)
        image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
        images = (image * 255).round().astype("uint8")
        pil_images = [Image.fromarray(image) for image in images]
        return pil_images

    def generate_with_embs(self, text_embeddings, text_input, loss_fn, loss_scale):
        """Sample one image from already-encoded text embeddings.

        Args:
            text_embeddings: Conditional text embeddings for the prompt.
            text_input: Tokenizer output for the prompt; only the length of
                ``input_ids`` is used, to pad the unconditional prompt.
            loss_fn: Callable taking decoded images (scaled to roughly 0..1)
                and returning a scalar loss, or ``None`` to sample without
                loss guidance.
            loss_scale: Multiplier applied to the loss before its gradient
                is taken.

        Returns:
            The generated PIL image.
        """
        # Seed the generator that creates the initial latent noise.
        generator = torch.manual_seed(self.manual_seed)
        batch_size = 1
        # Without a loss function there is nothing to differentiate, so the
        # guidance step is skipped instead of calling None.
        use_loss_guidance = loss_fn is not None

        max_length = text_input.input_ids.shape[-1]
        uncond_input = self.tokenizer(
            [""] * batch_size,
            padding="max_length",
            max_length=max_length,
            return_tensors="pt",
        )
        with torch.no_grad():
            uncond_embeddings = self.text_encoder(
                uncond_input.input_ids.to(self.device)
            )[0]
        text_embeddings = torch.cat([uncond_embeddings, text_embeddings])

        # Prep Scheduler
        self.set_timesteps(self.scheduler, self.num_inference_steps)

        # Prep latents
        latents = torch.randn(
            (
                batch_size,
                # Read the channel count from the model config; direct
                # attribute access on the model is deprecated in diffusers.
                self.unet.config.in_channels,
                self.height // 8,
                self.width // 8,
            ),
            generator=generator,
        )
        latents = latents.to(self.device)
        latents = latents * self.scheduler.init_noise_sigma

        # Loop
        for i, t in tqdm(
            enumerate(self.scheduler.timesteps), total=len(self.scheduler.timesteps)
        ):
            # Duplicate the latents so the unconditional and conditional
            # predictions come out of a single forward pass.
            latent_model_input = torch.cat([latents] * 2)
            sigma = self.scheduler.sigmas[i]
            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)

            # predict the noise residual
            with torch.no_grad():
                noise_pred = self.unet(
                    latent_model_input, t, encoder_hidden_states=text_embeddings
                )["sample"]

            # perform classifier-free guidance
            noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
            noise_pred = noise_pred_uncond + self.guidance_scale * (
                noise_pred_text - noise_pred_uncond
            )

            # Loss guidance, applied on every fifth step.
            if use_loss_guidance and i % 5 == 0:
                # Requires grad on the latents
                latents = latents.detach().requires_grad_()

                # Get the predicted x0 from the scheduler.
                latents_x0 = self.scheduler.step(
                    noise_pred, t, latents
                ).pred_original_sample

                # Decode to image space
                denoised_images = (
                    self.vae.decode((1 / 0.18215) * latents_x0).sample / 2 + 0.5
                )  # range (0, 1)

                # Calculate loss
                loss = loss_fn(denoised_images) * loss_scale

                # Get gradient
                cond_grad = torch.autograd.grad(loss, latents)[0]

                # Modify the latents based on this gradient
                latents = latents.detach() - cond_grad * sigma**2
                # The step() call above was only used to obtain x0; rewind the
                # scheduler's private step counter so the real step below
                # operates on the same timestep.
                self.scheduler._step_index = self.scheduler._step_index - 1

            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        return self.latents_to_pil(latents)[0]

    def generate_image(
        self,
        prompt="A campfire (oil on canvas)",
        loss_fn=None,
        loss_scale=200,
        concept_embed=None,  # birb_embed["<birb-style>"]
    ):
        """Generate an image for ``prompt``, optionally in a learned style.

        Args:
            prompt: Text prompt.
            loss_fn: Optional guidance loss, forwarded to
                ``generate_with_embs``.
            loss_scale: Multiplier for the guidance loss.
            concept_embed: Mapping whose first value is the learned style
                embedding. When given, " in the style of cs" is appended to
                the prompt and the embedding of the "cs" token is replaced
                with it. When ``None`` the prompt is used unchanged.

        Returns:
            The generated PIL image.
        """
        use_concept = concept_embed is not None
        if use_concept:
            # "cs" is the placeholder token whose embedding gets swapped out.
            prompt += " in the style of cs"

        text_input = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        input_ids = text_input.input_ids.to(self.device)

        # Get token embeddings
        token_embeddings = self.token_emb_layer(input_ids)

        if use_concept:
            custom_style_token = self.tokenizer.encode(
                "cs", add_special_tokens=False
            )[0]
            # The learned embedding is the first (expected only) entry.
            embed_key = next(iter(concept_embed))
            replacement_token_embedding = concept_embed[embed_key]

            # Insert this into the token embeddings
            token_embeddings[
                0, torch.where(input_ids[0] == custom_style_token)
            ] = replacement_token_embedding.to(self.device)

        # Combine with pos embs
        input_embeddings = token_embeddings + self.position_embeddings

        # Feed through to get final output embs
        modified_output_embeddings = self.get_output_embeds(input_embeddings)

        # And generate an image with this:
        generated_image = self.generate_with_embs(
            modified_output_embeddings, text_input, loss_fn, loss_scale
        )

        return generated_image

src/utils.py ADDED Viewed

	@@ -0,0 +1,11 @@

def loss_fn(images):
    """Return minus one third of the median value of ``images``.

    A higher median gives a lower loss.
    """
    median_value = images.median()
    return -median_value / 3
# Style display name -> file name of its learned concept embedding.
# NOTE(review): "Oil Style" previously pointed at "oil_style" with no
# extension, unlike every other entry; this assumes the file on disk is
# "oil_style.bin" -- confirm against the concept library directory.
concept_styles = {
    "Allante": "allante.bin",
    "XYZ": "xyz.bin",
    "Moebius": "moebius.bin",
    "Oil Style": "oil_style.bin",
    "Polygons": "poly.bin",
}