Spaces: Runtime error

1lint committed · Commit 919fef8
Parent(s): a4b1962

refactor code and fix cpu support

Files changed:
- app.py +58 -286
- footer.html → html/footer.html +0 -0
- header.html → html/header.html +0 -0
- style.css → html/style.css +0 -0
- model_ids.txt +6 -0
- test.ipynb +73 -0
- utils/__init__.py +0 -0
- utils/functions.py +273 -0
- inpaint_pipeline.py → utils/inpaint_pipeline.py +63 -22
- utils/shared.py +16 -0
- textual_inversion.py → utils/textual_inversion.py +216 -78
app.py
CHANGED

@@ -1,283 +1,19 @@
-
-# inpaint pipeline with fix to avoid noise added to latents during final iteration of denoising loop
-from inpaint_pipeline import SDInpaintPipeline as StableDiffusionInpaintPipelineLegacy
-
-from diffusers import (
-    StableDiffusionPipeline,
-    StableDiffusionImg2ImgPipeline,
-)
-
-import diffusers.schedulers
 import gradio as gr
-import torch
-import random
 from multiprocessing import cpu_count
-import
-from
-import os
-import argparse
-import shutil
-import gc
-
-import importlib
-
-from textual_inversion import main as run_textual_inversion
-
-def pad_image(image):
-    w, h = image.size
-    if w == h:
-        return image
-    elif w > h:
-        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
-        new_image.paste(image, (0, (w - h) // 2))
-        return new_image
-    else:
-        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
-        new_image.paste(image, ((h - w) // 2, 0))
-        return new_image
-
-_xformers_available = importlib.util.find_spec("xformers") is not None
-device = "cuda" if torch.cuda.is_available() else "cpu"
-low_vram_mode = False
-
-# scheduler dict includes superclass SchedulerMixin (it still generates reasonable images)
-scheduler_dict = {
-    k: v
-    for k, v in diffusers.schedulers.__dict__.items()
-    if "Scheduler" in k and "Flax" not in k
-}
-scheduler_dict.pop(
-    "VQDiffusionScheduler"
-)  # requires unique parameter, unlike other schedulers
-scheduler_names = list(scheduler_dict.keys())
-default_scheduler = scheduler_names[3]  # expected to be DPM Multistep
-
-model_ids = [
-    "andite/anything-v4.0",
-    "hakurei/waifu-diffusion",
-    "prompthero/openjourney-v2",
-    "runwayml/stable-diffusion-v1-5",
-    "johnslegers/epic-diffusion",
-    "stabilityai/stable-diffusion-2-1",
-]
-
-loaded_model_id = ""
-
-
-def load_pipe(
-    model_id, scheduler_name, pipe_class=StableDiffusionPipeline, pipe_kwargs="{}"
-):
-    global pipe, loaded_model_id
-
-    scheduler = scheduler_dict[scheduler_name]
-
-    # load new weights from disk only when changing model_id
-    if model_id != loaded_model_id:
-        pipe = pipe_class.from_pretrained(
-            model_id,
-            torch_dtype=torch.float16,
-            safety_checker=None,
-            requires_safety_checker=False,
-            scheduler=scheduler.from_pretrained(model_id, subfolder="scheduler"),
-            **json.loads(pipe_kwargs),
-        )
-        loaded_model_id = model_id
-
-    # if same model_id, instantiate new pipeline with same underlying pytorch objects to avoid reloading weights from disk
-    elif pipe_class != pipe.__class__ or not isinstance(pipe.scheduler, scheduler):
-        pipe.components["scheduler"] = scheduler.from_pretrained(
-            model_id, subfolder="scheduler"
-        )
-        pipe = pipe_class(**pipe.components)
-
-    if device == 'cuda':
-        pipe = pipe.to(device)
-        if _xformers_available:
-            pipe.enable_xformers_memory_efficient_attention()
-            print("using xformers")
-        if low_vram_mode:
-            pipe.enable_attention_slicing()
-            print("using attention slicing to lower VRAM")
-
-    return pipe
-
-
-pipe = None
-pipe = load_pipe(model_ids[0], default_scheduler)
-
-@torch.autocast(device)
-@torch.no_grad()
-def generate(
-    model_name,
-    scheduler_name,
-    prompt,
-    guidance,
-    steps,
-    n_images=1,
-    width=512,
-    height=512,
-    seed=0,
-    image=None,
-    strength=0.5,
-    inpaint_image=None,
-    inpaint_strength=0.5,
-    inpaint_radio='',
-    neg_prompt="",
-    pipe_class=StableDiffusionPipeline,
-    pipe_kwargs="{}",
-):
-
-    if seed == -1:
-        seed = random.randint(0, 2147483647)
-
-    generator = torch.Generator("cuda").manual_seed(seed)
-
-    pipe = load_pipe(
-        model_id=model_name,
-        scheduler_name=scheduler_name,
-        pipe_class=pipe_class,
-        pipe_kwargs=pipe_kwargs,
-    )
-
-    status_message = (
-        f"Prompt: '{prompt}' | Seed: {seed} | Guidance: {guidance} | Scheduler: {scheduler_name} | Steps: {steps}"
-    )
-
-    if pipe_class == StableDiffusionPipeline:
-        status_message = "Text to Image " + status_message
-
-        result = pipe(
-            prompt,
-            negative_prompt=neg_prompt,
-            num_images_per_prompt=n_images,
-            num_inference_steps=int(steps),
-            guidance_scale=guidance,
-            width=width,
-            height=height,
-            generator=generator,
-        )
-
-    elif pipe_class == StableDiffusionImg2ImgPipeline:
-
-        status_message = "Image to Image " + status_message
-        print(image.size)
-        image = image.resize((width, height))
-        print(image.size)
-
-        result = pipe(
-            prompt,
-            negative_prompt=neg_prompt,
-            num_images_per_prompt=n_images,
-            image=image,
-            num_inference_steps=int(steps),
-            strength=strength,
-            guidance_scale=guidance,
-            generator=generator,
-        )
-
-    elif pipe_class == StableDiffusionInpaintPipelineLegacy:
-        status_message = "Inpainting " + status_message
-
-        init_image = inpaint_image["image"].resize((width, height))
-        mask = inpaint_image["mask"].resize((width, height))
-
-        result = pipe(
-            prompt,
-            negative_prompt=neg_prompt,
-            num_images_per_prompt=n_images,
-            image=init_image,
-            mask_image=mask,
-            num_inference_steps=int(steps),
-            strength=inpaint_strength,
-            preserve_unmasked_image=(inpaint_radio == inpaint_options[0]),
-            guidance_scale=guidance,
-            generator=generator,
-        )
-
-    else:
-        return None, f"Unhandled pipeline class: {pipe_class}", -1
-
-    return result.images, status_message, seed
-
-
-# based on lvkaokao/textual-inversion-training
-def train_textual_inversion(model_name, scheduler_name, type_of_thing, files, concept_word, init_word, text_train_steps, text_train_bsz, text_learning_rate, progress=gr.Progress(track_tqdm=True)):
-
-    pipe = load_pipe(
-        model_id=model_name,
-        scheduler_name=scheduler_name,
-        pipe_class=StableDiffusionPipeline,
-    )
-
-    pipe.disable_xformers_memory_efficient_attention()  # xformers handled by textual inversion script
-
-    concept_dir = 'concept_images'
-    output_dir = 'output_model'
-    training_resolution = 512
-
-    if os.path.exists(output_dir): shutil.rmtree('output_model')
-    if os.path.exists(concept_dir): shutil.rmtree('concept_images')
-
-    os.makedirs(concept_dir, exist_ok=True)
-    os.makedirs(output_dir, exist_ok=True)
-
-    gc.collect()
-    torch.cuda.empty_cache()
-
-    if(prompt == "" or prompt == None):
-        raise gr.Error("You forgot to define your concept prompt")
-
-    for j, file_temp in enumerate(files):
-        file = Image.open(file_temp.name)
-        image = pad_image(file)
-        image = image.resize((training_resolution, training_resolution))
-        extension = file_temp.name.split(".")[1]
-        image = image.convert('RGB')
-        image.save(f'{concept_dir}/{j+1}.{extension}', quality=100)
-
-    args_general = argparse.Namespace(
-        train_data_dir=concept_dir,
-        learnable_property=type_of_thing,
-        placeholder_token=concept_word,
-        initializer_token=init_word,
-        resolution=training_resolution,
-        train_batch_size=text_train_bsz,
-        gradient_accumulation_steps=1,
-        gradient_checkpointing=True,
-        mixed_precision='fp16',
-        use_bf16=False,
-        max_train_steps=int(text_train_steps),
-        learning_rate=text_learning_rate,
-        scale_lr=True,
-        lr_scheduler="constant",
-        lr_warmup_steps=0,
-        output_dir=output_dir,
-    )
-
-    try:
-        final_result = run_textual_inversion(pipe, args_general)
-    except Exception as e:
-        raise gr.Error(e)
-
-    gc.collect()
-    torch.cuda.empty_cache()
-
-    return f'Finished training! Check the {output_dir} directory for saved model weights'
-
+from utils.functions import generate, train_textual_inversion
+from utils.shared import model_ids, scheduler_names, default_scheduler
 
 default_img_size = 512
 
-with open("header.html") as fp:
+with open("html/header.html") as fp:
     header = fp.read()
 
-with open("footer.html") as fp:
+with open("html/footer.html") as fp:
     footer = fp.read()
 
-with gr.Blocks(css="style.css") as demo:
+with gr.Blocks(css="html/style.css") as demo:
 
-    pipe_state = gr.State(lambda:
+    pipe_state = gr.State(lambda: 1)
 
     gr.HTML(header)

@@ -293,7 +29,7 @@ with gr.Blocks(css="style.css") as demo:

         with gr.Column(scale=30):
             model_name = gr.Dropdown(
-                label="Model", choices=model_ids, value=
+                label="Model", choices=model_ids, value=model_ids[0]
             )
             scheduler_name = gr.Dropdown(
                 label="Scheduler", choices=scheduler_names, value=default_scheduler

@@ -305,10 +41,10 @@ with gr.Blocks(css="style.css") as demo:
         with gr.Column():

             with gr.Tab("Text to Image") as tab:
-                tab.select(lambda:
+                tab.select(lambda: 1, [], pipe_state)

             with gr.Tab("Image to image") as tab:
-                tab.select(lambda:
+                tab.select(lambda: 2, [], pipe_state)

                 image = gr.Image(
                     label="Image to Image",

@@ -326,7 +62,7 @@ with gr.Blocks(css="style.css") as demo:
                 )

             with gr.Tab("Inpainting") as tab:
-                tab.select(lambda:
+                tab.select(lambda: 3, [], pipe_state)

                 inpaint_image = gr.Image(
                     label="Inpainting",

@@ -342,13 +78,26 @@ with gr.Blocks(css="style.css") as demo:
                     step=0.02,
                     value=0.8,
                 )
-                inpaint_options = [
+                inpaint_options = [
+                    "preserve non-masked portions of image",
+                    "output entire inpainted image",
+                ]
+                inpaint_radio = gr.Radio(
+                    inpaint_options,
+                    value=inpaint_options[0],
+                    show_label=False,
+                    interactive=True,
+                )

             with gr.Tab("Textual Inversion") as tab:
-                tab.select(lambda:
+                tab.select(lambda: 4, [], pipe_state)

-                type_of_thing = gr.Dropdown(
+                type_of_thing = gr.Dropdown(
+                    label="What would you like to train?",
+                    choices=["object", "person", "style"],
+                    value="object",
+                    interactive=True,
+                )

                 text_train_bsz = gr.Slider(
                     label="Training Batch Size",

@@ -358,14 +107,23 @@ with gr.Blocks(css="style.css") as demo:
                     value=1,
                 )

-                files = gr.File(
+                files = gr.File(
+                    label=f"""Upload the images for your concept""",
+                    file_count="multiple",
+                    interactive=True,
+                    visible=True,
+                )

                 text_train_steps = gr.Number(label="How many steps", value=1000)

-                text_learning_rate = gr.Number(label="Learning Rate", value=5.
+                text_learning_rate = gr.Number(label="Learning Rate", value=5.0e-4)

-                concept_word = gr.Textbox(
+                concept_word = gr.Textbox(
+                    label=f"""concept word - use a unique, made up word to avoid collisions"""
+                )
+                init_word = gr.Textbox(
+                    label=f"""initial word - to init the concept embedding"""
+                )

                 textual_inversion_button = gr.Button(value="Train Textual Inversion")

@@ -436,17 +194,31 @@ with gr.Blocks(css="style.css") as demo:
            pipe_state,
            pipe_kwargs,
        ]
-        outputs = [gallery, generation_details
+        outputs = [gallery, generation_details]

        prompt.submit(generate, inputs=inputs, outputs=outputs)
        generate_button.click(generate, inputs=inputs, outputs=outputs)

-        textual_inversion_inputs = [
+        textual_inversion_inputs = [
+            model_name,
+            scheduler_name,
+            type_of_thing,
+            files,
+            concept_word,
+            init_word,
+            text_train_steps,
+            text_train_bsz,
+            text_learning_rate,
+        ]

-        textual_inversion_button.click(
+        textual_inversion_button.click(
+            train_textual_inversion,
+            inputs=textual_inversion_inputs,
+            outputs=[training_status],
+        )

-    #demo = gr.TabbedInterface([demo, dreambooth_tab], ["Main", "Dreambooth"])
+    # demo = gr.TabbedInterface([demo, dreambooth_tab], ["Main", "Dreambooth"])

    demo.queue(concurrency_count=cpu_count())
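The refactored UI above tracks the active tab through an integer held in a gr.State, which is then passed to generate() to pick the pipeline. The following is a minimal standalone sketch of that pattern, assuming a recent Gradio Blocks API (gr.State and Tab.select); the component names here are illustrative, not part of the Space.

import gradio as gr

# Sketch: each Tab writes its integer id into a shared gr.State on selection;
# downstream handlers read that state to decide which pipeline to run.
with gr.Blocks() as demo:
    pipe_state = gr.State(1)  # 1 = text-to-image by default

    with gr.Tab("Text to Image") as t2i_tab:
        t2i_tab.select(lambda: 1, [], pipe_state)
    with gr.Tab("Image to image") as i2i_tab:
        i2i_tab.select(lambda: 2, [], pipe_state)

    label = gr.Textbox(label="Selected pipeline index")
    # Echo the current tab index back to the UI, standing in for generate().
    gr.Button("Show index").click(lambda idx: str(idx), inputs=pipe_state, outputs=label)

demo.launch()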
footer.html → html/footer.html
RENAMED (file without changes)

header.html → html/header.html
RENAMED (file without changes)

style.css → html/style.css
RENAMED (file without changes)
model_ids.txt
ADDED

@@ -0,0 +1,6 @@
+andite/anything-v4.0
+hakurei/waifu-diffusion
+prompthero/openjourney-v2
+runwayml/stable-diffusion-v1-5
+johnslegers/epic-diffusion
+stabilityai/stable-diffusion-2-1
test.ipynb
ADDED

@@ -0,0 +1,73 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('model_ids.txt', 'r') as fp:\n",
+    "    model_ids = fp.read().splitlines() "
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "['andite/anything-v4.0',\n",
+       " 'hakurei/waifu-diffusion',\n",
+       " 'prompthero/openjourney-v2',\n",
+       " 'runwayml/stable-diffusion-v1-5',\n",
+       " 'johnslegers/epic-diffusion',\n",
+       " 'stabilityai/stable-diffusion-2-1']"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model_ids"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "ml",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.8"
+  },
+  "orig_nbformat": 4,
+  "vscode": {
+   "interpreter": {
+    "hash": "cbbcdde725e9a65f1cb734ac4223fed46e03daf1eb62d8ccb3c48face3871521"
+   }
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}
utils/__init__.py
ADDED (file without changes)
utils/functions.py
ADDED

@@ -0,0 +1,273 @@
+import gradio as gr
+import torch
+import random
+from PIL import Image
+import os
+import argparse
+import shutil
+import gc
+import importlib
+import json
+
+from diffusers import (
+    StableDiffusionPipeline,
+    StableDiffusionImg2ImgPipeline,
+)
+
+
+from .inpaint_pipeline import SDInpaintPipeline as StableDiffusionInpaintPipelineLegacy
+
+from .textual_inversion import main as run_textual_inversion
+from .shared import default_scheduler, scheduler_dict, model_ids
+
+
+_xformers_available = importlib.util.find_spec("xformers") is not None
+device = "cuda" if torch.cuda.is_available() else "cpu"
+# device = 'cpu'
+dtype = torch.float16 if device == "cuda" else torch.float32
+low_vram_mode = False
+
+
+tab_to_pipeline = {
+    1: StableDiffusionPipeline,
+    2: StableDiffusionImg2ImgPipeline,
+    3: StableDiffusionInpaintPipelineLegacy,
+}
+
+
+def load_pipe(model_id, scheduler_name, tab_index=1, pipe_kwargs="{}"):
+    global pipe, loaded_model_id
+
+    scheduler = scheduler_dict[scheduler_name]
+
+    pipe_class = tab_to_pipeline[tab_index]
+
+    # load new weights from disk only when changing model_id
+    if model_id != loaded_model_id:
+        pipe = pipe_class.from_pretrained(
+            model_id,
+            torch_dtype=dtype,
+            safety_checker=None,
+            requires_safety_checker=False,
+            scheduler=scheduler.from_pretrained(model_id, subfolder="scheduler"),
+            **json.loads(pipe_kwargs),
+        )
+        loaded_model_id = model_id
+
+    # if same model_id, instantiate new pipeline with same underlying pytorch objects to avoid reloading weights from disk
+    elif pipe_class != pipe.__class__ or not isinstance(pipe.scheduler, scheduler):
+        pipe.components["scheduler"] = scheduler.from_pretrained(
+            model_id, subfolder="scheduler"
+        )
+        pipe = pipe_class(**pipe.components)
+
+    if device == "cuda":
+        pipe = pipe.to(device)
+        if _xformers_available:
+            pipe.enable_xformers_memory_efficient_attention()
+            print("using xformers")
+        if low_vram_mode:
+            pipe.enable_attention_slicing()
+            print("using attention slicing to lower VRAM")
+
+    return pipe
+
+
+pipe = None
+loaded_model_id = ""
+pipe = load_pipe(model_ids[0], default_scheduler)
+
+
+def pad_image(image):
+    w, h = image.size
+    if w == h:
+        return image
+    elif w > h:
+        new_image = Image.new(image.mode, (w, w), (0, 0, 0))
+        new_image.paste(image, (0, (w - h) // 2))
+        return new_image
+    else:
+        new_image = Image.new(image.mode, (h, h), (0, 0, 0))
+        new_image.paste(image, ((h - w) // 2, 0))
+        return new_image
+
+
+@torch.no_grad()
+def generate(
+    model_name,
+    scheduler_name,
+    prompt,
+    guidance,
+    steps,
+    n_images=1,
+    width=512,
+    height=512,
+    seed=0,
+    image=None,
+    strength=0.5,
+    inpaint_image=None,
+    inpaint_strength=0.5,
+    inpaint_radio="",
+    neg_prompt="",
+    tab_index=1,
+    pipe_kwargs="{}",
+    progress=gr.Progress(track_tqdm=True),
+):
+
+    if seed == -1:
+        seed = random.randint(0, 2147483647)
+
+    generator = torch.Generator(device).manual_seed(seed)
+
+    pipe = load_pipe(
+        model_id=model_name,
+        scheduler_name=scheduler_name,
+        tab_index=tab_index,
+        pipe_kwargs=pipe_kwargs,
+    )
+
+    status_message = f"Prompt: '{prompt}' | Seed: {seed} | Guidance: {guidance} | Scheduler: {scheduler_name} | Steps: {steps}"
+
+    if tab_index == 1:
+        status_message = "Text to Image " + status_message
+
+        result = pipe(
+            prompt,
+            negative_prompt=neg_prompt,
+            num_images_per_prompt=n_images,
+            num_inference_steps=int(steps),
+            guidance_scale=guidance,
+            width=width,
+            height=height,
+            generator=generator,
+        )
+
+    elif tab_index == 2:
+
+        status_message = "Image to Image " + status_message
+        print(image.size)
+        image = image.resize((width, height))
+        print(image.size)
+
+        result = pipe(
+            prompt,
+            negative_prompt=neg_prompt,
+            num_images_per_prompt=n_images,
+            image=image,
+            num_inference_steps=int(steps),
+            strength=strength,
+            guidance_scale=guidance,
+            generator=generator,
+        )
+
+    elif tab_index == 3:
+        status_message = "Inpainting " + status_message
+
+        init_image = inpaint_image["image"].resize((width, height))
+        mask = inpaint_image["mask"].resize((width, height))
+
+        result = pipe(
+            prompt,
+            negative_prompt=neg_prompt,
+            num_images_per_prompt=n_images,
+            image=init_image,
+            mask_image=mask,
+            num_inference_steps=int(steps),
+            strength=inpaint_strength,
+            preserve_unmasked_image=(
+                inpaint_radio == "preserve non-masked portions of image"
+            ),
+            guidance_scale=guidance,
+            generator=generator,
+        )
+
+    else:
+        return None, f"Unhandled tab index: {tab_index}"
+
+    return result.images, status_message
+
+
+# based on lvkaokao/textual-inversion-training
+def train_textual_inversion(
+    model_name,
+    scheduler_name,
+    type_of_thing,
+    files,
+    concept_word,
+    init_word,
+    text_train_steps,
+    text_train_bsz,
+    text_learning_rate,
+    progress=gr.Progress(track_tqdm=True),
+):
+
+    if device == "cpu":
+        raise gr.Error("Textual inversion training not supported on CPU")
+
+    pipe = load_pipe(
+        model_id=model_name,
+        scheduler_name=scheduler_name,
+        tab_index=1,
+    )
+
+    pipe.disable_xformers_memory_efficient_attention()  # xformers handled by textual inversion script
+
+    concept_dir = "concept_images"
+    output_dir = "output_model"
+    training_resolution = 512
+
+    if os.path.exists(output_dir):
+        shutil.rmtree("output_model")
+    if os.path.exists(concept_dir):
+        shutil.rmtree("concept_images")
+
+    os.makedirs(concept_dir, exist_ok=True)
+    os.makedirs(output_dir, exist_ok=True)
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    if concept_word == "" or concept_word == None:
+        raise gr.Error("You forgot to define your concept prompt")
+
+    for j, file_temp in enumerate(files):
+        file = Image.open(file_temp.name)
+        image = pad_image(file)
+        image = image.resize((training_resolution, training_resolution))
+        extension = file_temp.name.split(".")[1]
+        image = image.convert("RGB")
+        image.save(f"{concept_dir}/{j+1}.{extension}", quality=100)
+
+    args_general = argparse.Namespace(
+        train_data_dir=concept_dir,
+        learnable_property=type_of_thing,
+        placeholder_token=concept_word,
+        initializer_token=init_word,
+        resolution=training_resolution,
+        train_batch_size=text_train_bsz,
+        gradient_accumulation_steps=1,
+        gradient_checkpointing=True,
+        mixed_precision="fp16",
+        use_bf16=False,
+        max_train_steps=int(text_train_steps),
+        learning_rate=text_learning_rate,
+        scale_lr=True,
+        lr_scheduler="constant",
+        lr_warmup_steps=0,
+        output_dir=output_dir,
+    )
+
+    try:
+        final_result = run_textual_inversion(pipe, args_general)
+    except Exception as e:
+        raise gr.Error(e)
+
+    pipe.text_encoder = pipe.text_encoder.eval().to(device, dtype=dtype)
+    pipe.unet = pipe.unet.eval().to(device, dtype=dtype)
+
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    return (
+        f"Finished training! Check the {output_dir} directory for saved model weights"
+    )
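load_pipe above reloads weights from disk only when the model id changes; when only the pipeline class or scheduler changes it rebuilds a new wrapper around the same in-memory components. Below is a small standalone sketch of that caching idea; the cache dict and get_pipe name are illustrative, not part of the Space, while from_pretrained and the components mapping are standard diffusers API.

# Sketch of the pipeline-caching pattern used by load_pipe.
_cache = {"model_id": None, "pipe": None}

def get_pipe(model_id, pipe_class, **from_pretrained_kwargs):
    if model_id != _cache["model_id"]:
        # cold path: weights come from disk (or the Hugging Face cache)
        _cache["pipe"] = pipe_class.from_pretrained(model_id, **from_pretrained_kwargs)
        _cache["model_id"] = model_id
    elif not isinstance(_cache["pipe"], pipe_class):
        # warm path: same weights, different pipeline wrapper around the same components
        _cache["pipe"] = pipe_class(**_cache["pipe"].components)
    return _cache["pipe"]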
inpaint_pipeline.py → utils/inpaint_pipeline.py
RENAMED

@@ -1,4 +1,3 @@
-
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
 # You may obtain a copy of the License at

@@ -16,21 +15,30 @@ from typing import Optional, Union, List, Callable
 import PIL
 import numpy as np

-from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint_legacy import
+from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_inpaint_legacy import (
+    preprocess_image,
+    deprecate,
+    StableDiffusionInpaintPipelineLegacy,
+    StableDiffusionPipelineOutput,
+    PIL_INTERPOLATION,
+)
+

 def preprocess_mask(mask, scale_factor=8):
     mask = mask.convert("L")
     w, h = mask.size
     w, h = map(lambda x: x - x % 32, (w, h))  # resize to integer multiple of 32

-    #input_mask = mask.resize((w, h), resample=PIL_INTERPOLATION["nearest"])
+    # input_mask = mask.resize((w, h), resample=PIL_INTERPOLATION["nearest"])
     input_mask = np.array(mask).astype(np.float32) / 255.0
     input_mask = np.tile(input_mask, (3, 1, 1))
     input_mask = input_mask[None].transpose(0, 1, 2, 3)  # add batch dimension
     input_mask = 1 - input_mask  # repaint white, keep black
     input_mask = torch.round(torch.from_numpy(input_mask))

-    mask = mask.resize(
+    mask = mask.resize(
+        (w // scale_factor, h // scale_factor), resample=PIL_INTERPOLATION["nearest"]
+    )
     mask = np.array(mask).astype(np.float32) / 255.0
     mask = np.tile(mask, (4, 1, 1))
     mask = mask[None].transpose(0, 1, 2, 3)  # add batch dimension

@@ -40,7 +48,6 @@ def preprocess_mask(mask, scale_factor=8):
     return mask, input_mask

-
 class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):

     # forward call is same as StableDiffusionInpaintPipelineLegacy, but with line added to avoid noise added to final latents right before decoding step

@@ -120,8 +127,8 @@ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):
                 The frequency at which the `callback` function will be called. If not specified, the callback will be
                 called at every step.
             preserve_unmasked_image (`bool`, *optional*, defaults to `True`):
-                Whether or not to preserve the unmasked portions of the original image in the inpainted output. If False,
-                inpainting of the masked latents may produce noticeable distortion of unmasked portions of the decoded
+                Whether or not to preserve the unmasked portions of the original image in the inpainted output. If False,
+                inpainting of the masked latents may produce noticeable distortion of unmasked portions of the decoded
                 image.

         Returns:

@@ -148,7 +155,11 @@ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):

         # 3. Encode input prompt
         text_embeddings = self._encode_prompt(
-            prompt,
+            prompt,
+            device,
+            num_images_per_prompt,
+            do_classifier_free_guidance,
+            negative_prompt,
         )

         # 4. Preprocess image and mask

@@ -157,17 +168,27 @@ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):

         # get mask corresponding to input latents as well as image
         if not isinstance(mask_image, torch.FloatTensor):
-            mask_image, input_mask_image = preprocess_mask(
+            mask_image, input_mask_image = preprocess_mask(
+                mask_image, self.vae_scale_factor
+            )

         # 5. set timesteps
         self.scheduler.set_timesteps(num_inference_steps, device=device)
-        timesteps, num_inference_steps = self.get_timesteps(
+        timesteps, num_inference_steps = self.get_timesteps(
+            num_inference_steps, strength, device
+        )
         latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)

         # 6. Prepare latent variables
         # encode the init image into latents and scale the latents
         latents, init_latents_orig, noise = self.prepare_latents(
-            image,
+            image,
+            latent_timestep,
+            batch_size,
+            num_images_per_prompt,
+            text_embeddings.dtype,
+            device,
+            generator,
         )

         # 7. Prepare mask latent

@@ -181,33 +202,47 @@ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):
         num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
         with self.progress_bar(total=num_inference_steps) as progress_bar:
             for i, t in enumerate(timesteps):
-
+
                 # expand the latents if we are doing classifier free guidance
-                latent_model_input =
-
+                latent_model_input = (
+                    torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                )
+                latent_model_input = self.scheduler.scale_model_input(
+                    latent_model_input, t
+                )

                 # predict the noise residual
-                noise_pred = self.unet(
+                noise_pred = self.unet(
+                    latent_model_input, t, encoder_hidden_states=text_embeddings
+                ).sample

                 # perform guidance
                 if do_classifier_free_guidance:
                     noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
-                    noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred = noise_pred_uncond + guidance_scale * (
+                        noise_pred_text - noise_pred_uncond
+                    )

                 # compute the previous noisy sample x_t -> x_t-1
-                latents = self.scheduler.step(
+                latents = self.scheduler.step(
+                    noise_pred, t, latents, **extra_step_kwargs
+                ).prev_sample
                 # masking
                 if add_predicted_noise:
                     init_latents_proper = self.scheduler.add_noise(
                         init_latents_orig, noise_pred_uncond, torch.tensor([t])
                     )
                 else:
-                    init_latents_proper = self.scheduler.add_noise(
+                    init_latents_proper = self.scheduler.add_noise(
+                        init_latents_orig, noise, torch.tensor([t])
+                    )

                 latents = (init_latents_proper * mask) + (latents * (1 - mask))

                 # call the callback, if provided
-                if i == len(timesteps) - 1 or (
+                if i == len(timesteps) - 1 or (
+                    (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0
+                ):
                     progress_bar.update()
                     if callback is not None and i % callback_steps == 0:
                         callback(i, t, latents)

@@ -225,7 +260,9 @@ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):
             # restore unmasked parts of image with original image
             input_mask_image = input_mask_image.to(inpaint_image)
             image = image.to(inpaint_image)
-            image = (image * input_mask_image) + (
+            image = (image * input_mask_image) + (
+                inpaint_image * (1 - input_mask_image)
+            )  # use original unmasked portions of image to avoid degradation

             # post-processing of image
             image = (image / 2 + 0.5).clamp(0, 1)

@@ -235,7 +272,9 @@ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):
             image = self.decode_latents(latents)

         # 11. Run safety checker
-        image, has_nsfw_concept = self.run_safety_checker(
+        image, has_nsfw_concept = self.run_safety_checker(
+            image, device, text_embeddings.dtype
+        )

         # 12. Convert to PIL
         if output_type == "pil":

@@ -244,4 +283,6 @@ class SDInpaintPipeline(StableDiffusionInpaintPipelineLegacy):
         if not return_dict:
             return (image, has_nsfw_concept)

-        return StableDiffusionPipelineOutput(
+        return StableDiffusionPipelineOutput(
+            images=image, nsfw_content_detected=has_nsfw_concept
+        )
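The preserve_unmasked_image branch above composites the decoded inpainting result back into the original pixels using the binary mask, so unmasked regions are taken verbatim from the input. A minimal NumPy sketch of that compositing step, for illustration only:

import numpy as np

# Keep original pixels where keep_mask is 1 (unmasked region),
# take inpainted pixels where keep_mask is 0 (masked region).
def composite(original: np.ndarray, inpainted: np.ndarray, keep_mask: np.ndarray) -> np.ndarray:
    # original, inpainted: float arrays in [0, 1] with shape (H, W, 3)
    # keep_mask: float array of 0.0/1.0 values with shape (H, W, 1)
    return original * keep_mask + inpainted * (1.0 - keep_mask)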
utils/shared.py
ADDED

@@ -0,0 +1,16 @@
+import diffusers.schedulers
+
+# scheduler dict includes superclass SchedulerMixin (it still generates reasonable images)
+scheduler_dict = {
+    k: v
+    for k, v in diffusers.schedulers.__dict__.items()
+    if "Scheduler" in k and "Flax" not in k
+}
+scheduler_dict.pop(
+    "VQDiffusionScheduler"
+)  # requires unique parameter, unlike other schedulers
+scheduler_names = list(scheduler_dict.keys())
+default_scheduler = scheduler_names[3]  # expected to be DPM Multistep
+
+with open("model_ids.txt", "r") as fp:
+    model_ids = fp.read().splitlines()
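A short sketch of how the names exported by utils/shared.py are consumed elsewhere in this commit: a scheduler class is looked up by name and attached to one of the listed models. This mirrors the usage in utils/functions.load_pipe and assumes the same diffusers scheduler.from_pretrained(..., subfolder="scheduler") call used there.

from utils.shared import scheduler_dict, model_ids, default_scheduler

# Pick the default scheduler class and instantiate it for the first listed model.
model_id = model_ids[0]
scheduler_cls = scheduler_dict[default_scheduler]
scheduler = scheduler_cls.from_pretrained(model_id, subfolder="scheduler")
print(type(scheduler).__name__)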
textual_inversion.py β utils/textual_inversion.py
RENAMED
|
@@ -34,7 +34,12 @@ import transformers
|
|
| 34 |
from accelerate import Accelerator
|
| 35 |
from accelerate.logging import get_logger
|
| 36 |
from accelerate.utils import set_seed
|
| 37 |
-
from diffusers import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
from diffusers.optimization import get_scheduler
|
| 39 |
from diffusers.utils import check_min_version
|
| 40 |
from diffusers.utils.import_utils import is_xformers_available
|
|
@@ -76,7 +81,11 @@ logger = get_logger(__name__)
|
|
| 76 |
|
| 77 |
def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path):
|
| 78 |
logger.info("Saving embeddings")
|
| 79 |
-
learned_embeds =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
|
| 81 |
torch.save(learned_embeds_dict, save_path)
|
| 82 |
|
|
@@ -114,7 +123,10 @@ def parse_args():
|
|
| 114 |
help="Pretrained tokenizer name or path if not the same as model_name",
|
| 115 |
)
|
| 116 |
parser.add_argument(
|
| 117 |
-
"--train_data_dir",
|
|
|
|
|
|
|
|
|
|
| 118 |
)
|
| 119 |
parser.add_argument(
|
| 120 |
"--placeholder_token",
|
|
@@ -123,18 +135,33 @@ def parse_args():
|
|
| 123 |
help="A token to use as a placeholder for the concept.",
|
| 124 |
)
|
| 125 |
parser.add_argument(
|
| 126 |
-
"--initializer_token",
|
|
|
|
|
|
|
|
|
|
| 127 |
)
|
| 128 |
|
| 129 |
-
parser.add_argument(
|
| 130 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 131 |
parser.add_argument(
|
| 132 |
"--output_dir",
|
| 133 |
type=str,
|
| 134 |
default="text-inversion-model",
|
| 135 |
help="The output directory where the model predictions and checkpoints will be written.",
|
| 136 |
)
|
| 137 |
-
parser.add_argument(
|
|
|
|
|
|
|
| 138 |
parser.add_argument(
|
| 139 |
"--resolution",
|
| 140 |
type=int,
|
|
@@ -145,10 +172,15 @@ def parse_args():
|
|
| 145 |
),
|
| 146 |
)
|
| 147 |
parser.add_argument(
|
| 148 |
-
"--center_crop",
|
|
|
|
|
|
|
| 149 |
)
|
| 150 |
parser.add_argument(
|
| 151 |
-
"--train_batch_size",
|
|
|
|
|
|
|
|
|
|
| 152 |
)
|
| 153 |
parser.add_argument("--num_train_epochs", type=int, default=100)
|
| 154 |
parser.add_argument(
|
|
@@ -190,14 +222,43 @@ def parse_args():
|
|
| 190 |
),
|
| 191 |
)
|
| 192 |
parser.add_argument(
|
| 193 |
-
"--lr_warmup_steps",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
)
|
| 195 |
-
parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
|
| 196 |
-
parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
|
| 197 |
-
parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
|
| 198 |
-
parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
|
| 199 |
-
parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
|
| 200 |
-
parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
|
| 201 |
parser.add_argument(
|
| 202 |
"--hub_model_id",
|
| 203 |
type=str,
|
|
@@ -241,7 +302,12 @@ def parse_args():
|
|
| 241 |
' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
|
| 242 |
),
|
| 243 |
)
|
| 244 |
-
parser.add_argument(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
parser.add_argument(
|
| 246 |
"--checkpointing_steps",
|
| 247 |
type=int,
|
|
@@ -261,7 +327,9 @@ def parse_args():
|
|
| 261 |
),
|
| 262 |
)
|
| 263 |
parser.add_argument(
|
| 264 |
-
"--enable_xformers_memory_efficient_attention",
|
|
|
|
|
|
|
| 265 |
)
|
| 266 |
|
| 267 |
args = parser.parse_args()
|
|
@@ -269,7 +337,7 @@ def parse_args():
|
|
| 269 |
if env_local_rank != -1 and env_local_rank != args.local_rank:
|
| 270 |
args.local_rank = env_local_rank
|
| 271 |
|
| 272 |
-
#if args.train_data_dir is None:
|
| 273 |
# raise ValueError("You must specify a train data directory.")
|
| 274 |
|
| 275 |
return args
|
|
@@ -350,7 +418,10 @@ class TextualInversionDataset(Dataset):
|
|
| 350 |
self.center_crop = center_crop
|
| 351 |
self.flip_p = flip_p
|
| 352 |
|
| 353 |
-
self.image_paths = [
|
|
|
|
|
|
|
|
|
|
| 354 |
|
| 355 |
self.num_images = len(self.image_paths)
|
| 356 |
self._length = self.num_images
|
|
@@ -365,7 +436,11 @@ class TextualInversionDataset(Dataset):
|
|
| 365 |
"lanczos": PIL_INTERPOLATION["lanczos"],
|
| 366 |
}[interpolation]
|
| 367 |
|
| 368 |
-
self.templates =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 369 |
self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)
|
| 370 |
|
| 371 |
def __len__(self):
|
|
@@ -394,14 +469,13 @@ class TextualInversionDataset(Dataset):
|
|
| 394 |
|
| 395 |
if self.center_crop:
|
| 396 |
crop = min(img.shape[0], img.shape[1])
|
| 397 |
-
(
|
| 398 |
-
h,
|
| 399 |
-
w,
|
| 400 |
-
) = (
|
| 401 |
img.shape[0],
|
| 402 |
img.shape[1],
|
| 403 |
)
|
| 404 |
-
img = img[
|
|
|
|
|
|
|
| 405 |
|
| 406 |
image = Image.fromarray(img)
|
| 407 |
image = image.resize((self.size, self.size), resample=self.interpolation)
|
|
@@ -414,7 +488,9 @@ class TextualInversionDataset(Dataset):
|
|
| 414 |
return example
|
| 415 |
|
| 416 |
|
| 417 |
-
def get_full_repo_name(
|
|
|
|
|
|
|
| 418 |
if token is None:
|
| 419 |
token = HfFolder.get_token()
|
| 420 |
if organization is None:
|
|
@@ -424,7 +500,6 @@ def get_full_repo_name(model_id: str, organization: Optional[str] = None, token:
|
|
| 424 |
return f"{organization}/{model_id}"
|
| 425 |
|
| 426 |
|
| 427 |
-
|
| 428 |
def main(pipe, args_imported):
|
| 429 |
|
| 430 |
args = parse_args()
|
|
@@ -464,11 +539,15 @@ def main(pipe, args_imported):
|
|
| 464 |
if accelerator.is_main_process:
|
| 465 |
if args.push_to_hub:
|
| 466 |
if args.hub_model_id is None:
|
| 467 |
-
repo_name = get_full_repo_name(
|
|
|
|
|
|
|
| 468 |
else:
|
| 469 |
repo_name = args.hub_model_id
|
| 470 |
create_repo(repo_name, exist_ok=True, token=args.hub_token)
|
| 471 |
-
repo = Repository(
|
|
|
|
|
|
|
| 472 |
|
| 473 |
with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
|
| 474 |
if "step_*" not in gitignore:
|
|
@@ -530,7 +609,9 @@ def main(pipe, args_imported):
|
|
| 530 |
if is_xformers_available():
|
| 531 |
unet.enable_xformers_memory_efficient_attention()
|
| 532 |
else:
|
| 533 |
-
raise ValueError(
|
|
|
|
|
|
|
| 534 |
|
| 535 |
# Enable TF32 for faster training on Ampere GPUs,
|
| 536 |
# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
|
|
@@ -539,7 +620,10 @@ def main(pipe, args_imported):
|
|
| 539 |
|
| 540 |
if args.scale_lr:
|
| 541 |
args.learning_rate = (
|
| 542 |
-
args.learning_rate
|
|
|
|
|
|
|
|
|
|
| 543 |
)
|
| 544 |
|
| 545 |
# Initialize the optimizer
|
|
@@ -562,11 +646,15 @@ def main(pipe, args_imported):
|
|
| 562 |
center_crop=args.center_crop,
|
| 563 |
set="train",
|
| 564 |
)
|
| 565 |
-
-    train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=args.train_batch_size, shuffle=True)

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

@@ -597,7 +685,9 @@ def main(pipe, args_imported):
    text_encoder.to(accelerator.device, dtype=torch.float32)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
-    num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs

@@ -609,13 +699,19 @@ def main(pipe, args_imported):
    accelerator.init_trackers("textual_inversion", config=vars(args))

    # Train!
-    total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
-    logger.info(f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    global_step = 0

@@ -640,31 +736,51 @@ def main(pipe, args_imported):
        resume_step = resume_global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
-    progress_bar = tqdm(range(global_step, args.max_train_steps), disable=not accelerator.is_local_main_process)
    progress_bar.set_description("Steps")

    # keep original embeddings as reference
-    orig_embeds_params = accelerator.unwrap_model(text_encoder).get_input_embeddings().weight.data.clone()

-    for epoch in range(first_epoch, args.num_train_epochs):
        text_encoder.train()
        for step, batch in enumerate(train_dataloader):
            # Skip steps until we reach the resumed step
-            if args.resume_from_checkpoint and epoch == first_epoch and step < resume_step:
                if step % args.gradient_accumulation_steps == 0:
                    progress_bar.update(1)
                continue

            with accelerator.accumulate(text_encoder):
                # Convert images to latent space
-                latents = vae.encode(batch["pixel_values"].to(dtype=weight_dtype)).latent_dist.sample().detach()
                latents = latents * 0.18215

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
-                timesteps = torch.randint(0, noise_scheduler.config.num_train_timesteps, (bsz,), device=latents.device)
                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep

@@ -672,10 +788,14 @@ def main(pipe, args_imported):
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
-                encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(dtype=weight_dtype)

                # Predict the noise residual
-                model_pred = unet(noisy_latents, timesteps, encoder_hidden_states).sample

                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":

@@ -683,7 +803,9 @@ def main(pipe, args_imported):
                elif noise_scheduler.config.prediction_type == "v_prediction":
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
-                    raise ValueError(f"Unknown prediction type {noise_scheduler.config.prediction_type}")

                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")

@@ -694,8 +816,12 @@ def main(pipe, args_imported):
                else:
                    grads = text_encoder.get_input_embeddings().weight.grad
                    # Get the index for tokens that we want to zero the grads for
-                    index_grads_to_zero = torch.arange(len(tokenizer)) != placeholder_token_id
-                    grads.data[index_grads_to_zero, :] = grads.data[index_grads_to_zero, :].fill_(0)

                optimizer.step()
                lr_scheduler.step()

@@ -704,21 +830,31 @@ def main(pipe, args_imported):
                # Let's make sure we don't update any embedding weights besides the newly added token
                index_no_updates = torch.arange(len(tokenizer)) != placeholder_token_id
                with torch.no_grad():
-                    accelerator.unwrap_model(text_encoder).get_input_embeddings().weight[
                        index_no_updates
-                    ] = orig_embeds_params[index_no_updates]

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
                if global_step % args.save_steps == 0:
-                    save_path = os.path.join(args.output_dir, f"{args.placeholder_token}-{global_step}.bin")
-                    save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)

                if global_step % args.checkpointing_steps == 0:
                    if accelerator.is_main_process:
-                        save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
                        accelerator.save_state(save_path)
                        logger.info(f"Saved state to {save_path}")

@@ -733,7 +869,9 @@ def main(pipe, args_imported):
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if args.push_to_hub and args.only_save_embeds:
-            logger.warn("Enabling full model saving because --push_to_hub=True was specified.")
            save_full_model = True
        else:
            save_full_model = not args.only_save_embeds

@@ -744,35 +882,35 @@ def main(pipe, args_imported):
        save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)

        if args.push_to_hub:
-            repo.push_to_hub(commit_message="End of training", blocking=False, auto_lfs_prune=True)

    accelerator.end_training()
-    text_encoder.eval()
-    unet.eval()
-    vae.eval()


if __name__ == "__main__":
-    pipeline = StableDiffusionPipeline.from_pretrained("andite/anything-v4.0", torch_dtype=torch.float16)
-
-    imported_args = argparse.Namespace(
-        train_data_dir="concept_images",
-        learnable_property='object',
-        placeholder_token='redeyegirl',
-        initializer_token='girl',
-        resolution=512,
-        train_batch_size=1,
-        gradient_accumulation_steps=1,
-        gradient_checkpointing=True,
-        mixed_precision='fp16',
-        use_bf16=False,
-        max_train_steps=1000,
-        learning_rate=5.0e-4,
-        scale_lr=False,
-        lr_scheduler="constant",
-        lr_warmup_steps=0,
-        output_dir="output_model",
-    )

    main(pipeline, imported_args)
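The loop above only ever trains the placeholder token's embedding: after every optimizer step it restores all other rows of the text encoder's embedding matrix from `orig_embeds_params`. A minimal standalone sketch of that masking step (the vocabulary size, embedding width, and token id below are illustrative, not taken from this repo):

import torch

# Illustrative sizes: a 10-token "vocabulary" with 4-dimensional embeddings.
embeddings = torch.nn.Embedding(10, 4)
orig_embeds_params = embeddings.weight.data.clone()
placeholder_token_id = 7

# ... after optimizer.step() ...
# Restore every row except the placeholder token's, so only the new token actually learns.
index_no_updates = torch.arange(embeddings.num_embeddings) != placeholder_token_id
with torch.no_grad():
    embeddings.weight[index_no_updates] = orig_embeds_params[index_no_updates]

The corresponding additions in utils/textual_inversion.py follow.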
from accelerate import Accelerator
from accelerate.logging import get_logger
from accelerate.utils import set_seed
+from diffusers import (
+    AutoencoderKL,
+    DDPMScheduler,
+    StableDiffusionPipeline,
+    UNet2DConditionModel,
+)
from diffusers.optimization import get_scheduler
from diffusers.utils import check_min_version
from diffusers.utils.import_utils import is_xformers_available

...

def save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path):
    logger.info("Saving embeddings")
+    learned_embeds = (
+        accelerator.unwrap_model(text_encoder)
+        .get_input_embeddings()
+        .weight[placeholder_token_id]
+    )
    learned_embeds_dict = {args.placeholder_token: learned_embeds.detach().cpu()}
    torch.save(learned_embeds_dict, save_path)

...

        help="Pretrained tokenizer name or path if not the same as model_name",
    )
    parser.add_argument(
+        "--train_data_dir",
+        type=str,
+        default=None,
+        help="A folder containing the training data.",
    )
    parser.add_argument(
        "--placeholder_token",

...

        help="A token to use as a placeholder for the concept.",
    )
    parser.add_argument(
+        "--initializer_token",
+        type=str,
+        default=None,
+        help="A token to use as initializer word.",
    )

+    parser.add_argument(
+        "--learnable_property",
+        type=str,
+        default="object",
+        help="Choose between 'object' and 'style'",
+    )
+    parser.add_argument(
+        "--repeats",
+        type=int,
+        default=100,
+        help="How many times to repeat the training data.",
+    )
    parser.add_argument(
        "--output_dir",
        type=str,
        default="text-inversion-model",
        help="The output directory where the model predictions and checkpoints will be written.",
    )
+    parser.add_argument(
+        "--seed", type=int, default=None, help="A seed for reproducible training."
+    )
    parser.add_argument(
        "--resolution",
        type=int,

...

        ),
    )
    parser.add_argument(
+        "--center_crop",
+        action="store_true",
+        help="Whether to center crop images before resizing to resolution",
    )
    parser.add_argument(
+        "--train_batch_size",
+        type=int,
+        default=16,
+        help="Batch size (per device) for the training dataloader.",
    )
    parser.add_argument("--num_train_epochs", type=int, default=100)
    parser.add_argument(

...

        ),
    )
    parser.add_argument(
+        "--lr_warmup_steps",
+        type=int,
+        default=500,
+        help="Number of steps for the warmup in the lr scheduler.",
+    )
+    parser.add_argument(
+        "--adam_beta1",
+        type=float,
+        default=0.9,
+        help="The beta1 parameter for the Adam optimizer.",
+    )
+    parser.add_argument(
+        "--adam_beta2",
+        type=float,
+        default=0.999,
+        help="The beta2 parameter for the Adam optimizer.",
+    )
+    parser.add_argument(
+        "--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use."
+    )
+    parser.add_argument(
+        "--adam_epsilon",
+        type=float,
+        default=1e-08,
+        help="Epsilon value for the Adam optimizer",
+    )
+    parser.add_argument(
+        "--push_to_hub",
+        action="store_true",
+        help="Whether or not to push the model to the Hub.",
+    )
+    parser.add_argument(
+        "--hub_token",
+        type=str,
+        default=None,
+        help="The token to use to push to the Model Hub.",
    )
    parser.add_argument(
        "--hub_model_id",
        type=str,

...

            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
        ),
    )
+    parser.add_argument(
+        "--local_rank",
+        type=int,
+        default=-1,
+        help="For distributed training: local_rank",
+    )
    parser.add_argument(
        "--checkpointing_steps",
        type=int,

...

        ),
    )
    parser.add_argument(
+        "--enable_xformers_memory_efficient_attention",
+        action="store_true",
+        help="Whether or not to use xformers.",
    )

    args = parser.parse_args()

    env_local_rank = int(os.environ.get("LOCAL_RANK", -1))
    if env_local_rank != -1 and env_local_rank != args.local_rank:
        args.local_rank = env_local_rank

+    # if args.train_data_dir is None:
    #     raise ValueError("You must specify a train data directory.")

    return args

...

        self.center_crop = center_crop
        self.flip_p = flip_p

+        self.image_paths = [
+            os.path.join(self.data_root, file_path)
+            for file_path in os.listdir(self.data_root)
+        ]

        self.num_images = len(self.image_paths)
        self._length = self.num_images

...

            "lanczos": PIL_INTERPOLATION["lanczos"],
        }[interpolation]

+        self.templates = (
+            imagenet_style_templates_small
+            if learnable_property == "style"
+            else imagenet_templates_small
+        )
        self.flip_transform = transforms.RandomHorizontalFlip(p=self.flip_p)

    def __len__(self):

...

        if self.center_crop:
            crop = min(img.shape[0], img.shape[1])
+            (h, w,) = (
                img.shape[0],
                img.shape[1],
            )
+            img = img[
+                (h - crop) // 2 : (h + crop) // 2, (w - crop) // 2 : (w + crop) // 2
+            ]

        image = Image.fromarray(img)
        image = image.resize((self.size, self.size), resample=self.interpolation)

...

        return example


+def get_full_repo_name(
+    model_id: str, organization: Optional[str] = None, token: Optional[str] = None
+):
    if token is None:
        token = HfFolder.get_token()
    if organization is None:
...
    return f"{organization}/{model_id}"


def main(pipe, args_imported):

    args = parse_args()

...

    if accelerator.is_main_process:
        if args.push_to_hub:
            if args.hub_model_id is None:
+                repo_name = get_full_repo_name(
+                    Path(args.output_dir).name, token=args.hub_token
+                )
            else:
                repo_name = args.hub_model_id
            create_repo(repo_name, exist_ok=True, token=args.hub_token)
+            repo = Repository(
+                args.output_dir, clone_from=repo_name, token=args.hub_token
+            )

        with open(os.path.join(args.output_dir, ".gitignore"), "w+") as gitignore:
            if "step_*" not in gitignore:

...

        if is_xformers_available():
            unet.enable_xformers_memory_efficient_attention()
        else:
+            raise ValueError(
+                "xformers is not available. Make sure it is installed correctly"
+            )

    # Enable TF32 for faster training on Ampere GPUs,
    # cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices

...

    if args.scale_lr:
        args.learning_rate = (
+            args.learning_rate
+            * args.gradient_accumulation_steps
+            * args.train_batch_size
+            * accelerator.num_processes
        )

    # Initialize the optimizer

...

        center_crop=args.center_crop,
        set="train",
    )
+    train_dataloader = torch.utils.data.DataLoader(
+        train_dataset, batch_size=args.train_batch_size, shuffle=True
+    )

    # Scheduler and math around the number of training steps.
    overrode_max_train_steps = False
+    num_update_steps_per_epoch = math.ceil(
+        len(train_dataloader) / args.gradient_accumulation_steps
+    )
    if args.max_train_steps is None:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
        overrode_max_train_steps = True

...

    text_encoder.to(accelerator.device, dtype=torch.float32)

    # We need to recalculate our total training steps as the size of the training dataloader may have changed.
+    num_update_steps_per_epoch = math.ceil(
+        len(train_dataloader) / args.gradient_accumulation_steps
+    )
    if overrode_max_train_steps:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
    # Afterwards we recalculate our number of training epochs

...

    accelerator.init_trackers("textual_inversion", config=vars(args))

    # Train!
+    total_batch_size = (
+        args.train_batch_size
+        * accelerator.num_processes
+        * args.gradient_accumulation_steps
+    )

    logger.info("***** Running training *****")
    logger.info(f"  Num examples = {len(train_dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous batch size per device = {args.train_batch_size}")
+    logger.info(
+        f"  Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}"
+    )
    logger.info(f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")
    global_step = 0

...

        resume_step = resume_global_step % num_update_steps_per_epoch

    # Only show the progress bar once on each machine.
+    progress_bar = tqdm(
+        range(global_step, args.max_train_steps),
+        disable=not accelerator.is_local_main_process,
+    )
    progress_bar.set_description("Steps")

    # keep original embeddings as reference
+    orig_embeds_params = (
+        accelerator.unwrap_model(text_encoder)
+        .get_input_embeddings()
+        .weight.data.clone()
+    )

+    for epoch in range(first_epoch, args.num_train_epochs):
        text_encoder.train()
        for step, batch in enumerate(train_dataloader):
            # Skip steps until we reach the resumed step
+            if (
+                args.resume_from_checkpoint
+                and epoch == first_epoch
+                and step < resume_step
+            ):
                if step % args.gradient_accumulation_steps == 0:
                    progress_bar.update(1)
                continue

            with accelerator.accumulate(text_encoder):
                # Convert images to latent space
+                latents = (
+                    vae.encode(batch["pixel_values"].to(dtype=weight_dtype))
+                    .latent_dist.sample()
+                    .detach()
+                )
                latents = latents * 0.18215

                # Sample noise that we'll add to the latents
                noise = torch.randn_like(latents)
                bsz = latents.shape[0]
                # Sample a random timestep for each image
+                timesteps = torch.randint(
+                    0,
+                    noise_scheduler.config.num_train_timesteps,
+                    (bsz,),
+                    device=latents.device,
+                )
                timesteps = timesteps.long()

                # Add noise to the latents according to the noise magnitude at each timestep
...
                noisy_latents = noise_scheduler.add_noise(latents, noise, timesteps)

                # Get the text embedding for conditioning
+                encoder_hidden_states = text_encoder(batch["input_ids"])[0].to(
+                    dtype=weight_dtype
+                )

                # Predict the noise residual
+                model_pred = unet(
+                    noisy_latents, timesteps, encoder_hidden_states
+                ).sample

                # Get the target for loss depending on the prediction type
                if noise_scheduler.config.prediction_type == "epsilon":
...
                elif noise_scheduler.config.prediction_type == "v_prediction":
                    target = noise_scheduler.get_velocity(latents, noise, timesteps)
                else:
+                    raise ValueError(
+                        f"Unknown prediction type {noise_scheduler.config.prediction_type}"
+                    )

                loss = F.mse_loss(model_pred.float(), target.float(), reduction="mean")
...
                else:
                    grads = text_encoder.get_input_embeddings().weight.grad
                    # Get the index for tokens that we want to zero the grads for
+                    index_grads_to_zero = (
+                        torch.arange(len(tokenizer)) != placeholder_token_id
+                    )
+                    grads.data[index_grads_to_zero, :] = grads.data[
+                        index_grads_to_zero, :
+                    ].fill_(0)

                optimizer.step()
                lr_scheduler.step()
...
                # Let's make sure we don't update any embedding weights besides the newly added token
                index_no_updates = torch.arange(len(tokenizer)) != placeholder_token_id
                with torch.no_grad():
+                    accelerator.unwrap_model(
+                        text_encoder
+                    ).get_input_embeddings().weight[
+                        index_no_updates
+                    ] = orig_embeds_params[
                        index_no_updates
+                    ]

            # Checks if the accelerator has performed an optimization step behind the scenes
            if accelerator.sync_gradients:
                progress_bar.update(1)
                global_step += 1
                if global_step % args.save_steps == 0:
+                    save_path = os.path.join(
+                        args.output_dir, f"{args.placeholder_token}-{global_step}.bin"
+                    )
+                    save_progress(
+                        text_encoder, placeholder_token_id, accelerator, args, save_path
+                    )

                if global_step % args.checkpointing_steps == 0:
                    if accelerator.is_main_process:
+                        save_path = os.path.join(
+                            args.output_dir, f"checkpoint-{global_step}"
+                        )
                        accelerator.save_state(save_path)
                        logger.info(f"Saved state to {save_path}")

...

    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if args.push_to_hub and args.only_save_embeds:
+            logger.warn(
+                "Enabling full model saving because --push_to_hub=True was specified."
+            )
            save_full_model = True
        else:
            save_full_model = not args.only_save_embeds
...
        save_progress(text_encoder, placeholder_token_id, accelerator, args, save_path)

        if args.push_to_hub:
+            repo.push_to_hub(
+                commit_message="End of training", blocking=False, auto_lfs_prune=True
+            )

    accelerator.end_training()


if __name__ == "__main__":
+    pipeline = StableDiffusionPipeline.from_pretrained(
+        "andite/anything-v4.0", torch_dtype=torch.float16
+    )

+    imported_args = argparse.Namespace(
+        train_data_dir="concept_images",
+        learnable_property="object",
+        placeholder_token="redeyegirl",
+        initializer_token="girl",
+        resolution=512,
+        train_batch_size=1,
+        gradient_accumulation_steps=1,
+        gradient_checkpointing=True,
+        mixed_precision="fp16",
+        use_bf16=False,
+        max_train_steps=1000,
+        learning_rate=5.0e-4,
+        scale_lr=False,
+        lr_scheduler="constant",
+        lr_warmup_steps=0,
+        output_dir="output_model",
+    )

    main(pipeline, imported_args)