Spaces:

fantos
/

EveryText

Paused

App Files Files Community

fantos commited on Aug 28, 2024

Commit

f2a614f

verified ·

1 Parent(s): b4134a4

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -244

app.py CHANGED Viewed

@@ -5,7 +5,7 @@ import cv2
 import gradio as gr
 import numpy as np
 from huggingface_hub import snapshot_download
-from transformers import CLIPVisionModelWithProjection, CLIPImageProcessor, pipeline
 from diffusers.utils import load_image
 from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
 from kolors.models.modeling_chatglm import ChatGLMModel
@@ -15,16 +15,11 @@ from diffusers import AutoencoderKL
 from kolors.models.unet_2d_condition import UNet2DConditionModel
 from diffusers import EulerDiscreteScheduler
 from PIL import Image, ImageDraw, ImageFont
-from annotator.midas import MidasDetector
-from annotator.dwpose import DWposeDetector
-from annotator.util import resize_image, HWC3
 import os
 device = "cuda"
 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
-ckpt_dir_depth = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Depth")
 ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
-ckpt_dir_pose = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Pose")
 # Add translation pipeline
 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
@@ -34,19 +29,7 @@ tokenizer = ChatGLMTokenizer.from_pretrained(f'{ckpt_dir}/text_encoder')
 vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
 scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
 unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
-controlnet_depth = ControlNetModel.from_pretrained(f"{ckpt_dir_depth}", revision=None).half().to(device)
 controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
-controlnet_pose = ControlNetModel.from_pretrained(f"{ckpt_dir_pose}", revision=None).half().to(device)
-pipe_depth = StableDiffusionXLControlNetImg2ImgPipeline(
-    vae=vae,
-    controlnet=controlnet_depth,
-    text_encoder=text_encoder,
-    tokenizer=tokenizer,
-    unet=unet,
-    scheduler=scheduler,
-    force_zeros_for_empty_prompt=False
-)
 pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
     vae=vae,
@@ -58,16 +41,6 @@ pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
     force_zeros_for_empty_prompt=False
 )
-pipe_pose = StableDiffusionXLControlNetImg2ImgPipeline(
-    vae=vae,
-    controlnet=controlnet_pose,
-    text_encoder=text_encoder,
-    tokenizer=tokenizer,
-    unet=unet,
-    scheduler=scheduler,
-    force_zeros_for_empty_prompt=False
-)
 @spaces.GPU
 def translate_korean_to_english(text):
     if any(ord(char) >= 0xAC00 and ord(char) <= 0xD7A3 for char in text):  # Check if Korean characters are present
@@ -84,150 +57,10 @@ def process_canny_condition(image, canny_threods=[100,200]):
     np_image = HWC3(np_image)
     return Image.fromarray(np_image)
-model_midas = MidasDetector()
-@spaces.GPU
-def process_depth_condition_midas(img, res = 1024):
-    h,w,_ = img.shape
-    img = resize_image(HWC3(img), res)
-    result = HWC3(model_midas(img))
-    result = cv2.resize(result, (w,h))
-    return Image.fromarray(result)
-model_dwpose = DWposeDetector()
-@spaces.GPU
-def process_dwpose_condition(image, res=1024):
-    h,w,_ = image.shape
-    img = resize_image(HWC3(image), res)
-    out_res, out_img = model_dwpose(image)
-    result = HWC3(out_img)
-    result = cv2.resize(result, (w,h))
-    return Image.fromarray(result)
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024
-@spaces.GPU
-def infer_depth(prompt,
-          image = None,
-          negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
-          seed = 397886929,
-          randomize_seed = False,
-          guidance_scale = 6.0,
-          num_inference_steps = 50,
-          controlnet_conditioning_scale = 0.7,
-          control_guidance_end = 0.9,
-          strength = 1.0
-        ):
-    prompt = translate_korean_to_english(prompt)
-    negative_prompt = translate_korean_to_english(negative_prompt)
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
-    pipe = pipe_depth.to("cuda")
-    condi_img = process_depth_condition_midas(np.array(init_image), MAX_IMAGE_SIZE)
-    image = pipe(
-        prompt=prompt,
-        image=init_image,
-        controlnet_conditioning_scale=controlnet_conditioning_scale,
-        control_guidance_end=control_guidance_end,
-        strength=strength,
-        control_image=condi_img,
-        negative_prompt=negative_prompt,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        num_images_per_prompt=1,
-        generator=generator,
-    ).images[0]
-    return [condi_img, image], seed
-@spaces.GPU
-def infer_canny(prompt,
-          image = None,
-          negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
-          seed = 397886929,
-          randomize_seed = False,
-          guidance_scale = 6.0,
-          num_inference_steps = 50,
-          controlnet_conditioning_scale = 0.7,
-          control_guidance_end = 0.9,
-          strength = 1.0
-        ):
-    prompt = translate_korean_to_english(prompt)
-    negative_prompt = translate_korean_to_english(negative_prompt)
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
-    pipe = pipe_canny.to("cuda")
-    condi_img = process_canny_condition(np.array(init_image))
-    image = pipe(
-        prompt=prompt,
-        image=init_image,
-        controlnet_conditioning_scale=controlnet_conditioning_scale,
-        control_guidance_end=control_guidance_end,
-        strength=strength,
-        control_image=condi_img,
-        negative_prompt=negative_prompt,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        num_images_per_prompt=1,
-        generator=generator,
-    ).images[0]
-    return [condi_img, image], seed
-@spaces.GPU
-def infer_pose(prompt,
-          image = None,
-          negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
-          seed = 66,
-          randomize_seed = False,
-          guidance_scale = 6.0,
-          num_inference_steps = 50,
-          controlnet_conditioning_scale = 0.7,
-          control_guidance_end = 0.9,
-          strength = 1.0
-        ):
-    prompt = translate_korean_to_english(prompt)
-    negative_prompt = translate_korean_to_english(negative_prompt)
-    if randomize_seed:
-        seed = random.randint(0, MAX_SEED)
-    generator = torch.Generator().manual_seed(seed)
-    init_image = resize_image(image, MAX_IMAGE_SIZE)
-    pipe = pipe_pose.to("cuda")
-    condi_img = process_dwpose_condition(np.array(init_image), MAX_IMAGE_SIZE)
-    image = pipe(
-        prompt=prompt,
-        image=init_image,
-        controlnet_conditioning_scale=controlnet_conditioning_scale,
-        control_guidance_end=control_guidance_end,
-        strength=strength,
-        control_image=condi_img,
-        negative_prompt=negative_prompt,
-        num_inference_steps=num_inference_steps,
-        guidance_scale=guidance_scale,
-        num_images_per_prompt=1,
-        generator=generator,
-    ).images[0]
-    return [condi_img, image], seed
-css = """
-footer {
-    visibility: hidden;
-}
-"""
-def load_description(fp):
-    with open(fp, 'r', encoding='utf-8') as f:
-        content = f.read()
-    return content
-# Add the text_to_image function
-def text_to_image(text, size, position):
     width, height = 1024, 576
     image = Image.new("RGB", (width, height), "white")
     draw = ImageDraw.Draw(image)
@@ -278,6 +111,51 @@ def text_to_image(text, size, position):
     return image
 with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
     with gr.Row():
         with gr.Column(elem_id="col-left"):
@@ -287,21 +165,6 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
                     placeholder="Enter your prompt",
                     lines=2
                 )
-            with gr.Row():
-                image_input_type = gr.Radio(["Upload Image", "Generate Text Image"], label="Input Type", value="Upload Image")
-            with gr.Row():
-                image = gr.Image(label="Image", type="pil", visible=True)
-                with gr.Column(visible=False) as text_image_inputs:
-                    text_input = gr.Textbox(label="Enter Text", lines=5, placeholder="Type your text here...")
-                    font_size = gr.Radio([48, 72, 96, 144], label="Font Size", value=72)
-                    text_position = gr.Dropdown(
-                        ["top-left", "top-center", "top-right", "middle-left", "middle-center", "middle-right", "bottom-left", "bottom-center", "bottom-right"],
-                        label="Text Position",
-                        value="middle-center"
-                    )
-                    generate_text_image = gr.Button("Generate Text Image")
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt = gr.Textbox(
                     label="Negative prompt",
@@ -357,73 +220,15 @@ with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
                     )
             with gr.Row():
                 canny_button = gr.Button("Canny", elem_id="button")
-                depth_button = gr.Button("Depth", elem_id="button")
-                pose_button = gr.Button("Pose", elem_id="button")
         with gr.Column(elem_id="col-right"):
             result = gr.Gallery(label="Result", show_label=False, columns=2)
             seed_used = gr.Number(label="Seed Used")
-    def toggle_image_input(choice):
-        return {
-            image: gr.update(visible=choice == "Upload Image"),
-            text_image_inputs: gr.update(visible=choice == "Generate Text Image")
-        }
-    image_input_type.change(toggle_image_input, image_input_type, [image, text_image_inputs])
-    def generate_and_use_text_image(text, size, position):
-        text_image = text_to_image(text, size, position)
-        return text_image
-    generate_text_image.click(
-        generate_and_use_text_image,
-        inputs=[text_input, font_size, text_position],
-        outputs=image
-    )
-    with gr.Row():
-        gr.Examples(
-                fn = infer_canny,
-                examples = canny_examples,
-                inputs = [prompt, image],
-                outputs = [result, seed_used],
-                label = "Canny"
-            )
-    with gr.Row():
-        gr.Examples(
-                fn = infer_depth,
-                examples = depth_examples,
-                inputs = [prompt, image],
-                outputs = [result, seed_used],
-                label = "Depth"
-            )
-    with gr.Row():
-        gr.Examples(
-                fn = infer_pose,
-                examples = pose_examples,
-                inputs = [prompt, image],
-                outputs = [result, seed_used],
-                label = "Pose"
-            )
     canny_button.click(
         fn = infer_canny,
-        inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
-        outputs = [result, seed_used]
-    )
-    depth_button.click(
-        fn = infer_depth,
-        inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
-        outputs = [result, seed_used]
-    )
-    pose_button.click(
-        fn = infer_pose,
-        inputs = [prompt, image, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
         outputs = [result, seed_used]
     )
-Kolors.queue().launch(debug=True)

 import gradio as gr
 import numpy as np
 from huggingface_hub import snapshot_download
+from transformers import pipeline
 from diffusers.utils import load_image
 from kolors.pipelines.pipeline_controlnet_xl_kolors_img2img import StableDiffusionXLControlNetImg2ImgPipeline
 from kolors.models.modeling_chatglm import ChatGLMModel
 from kolors.models.unet_2d_condition import UNet2DConditionModel
 from diffusers import EulerDiscreteScheduler
 from PIL import Image, ImageDraw, ImageFont
 import os
 device = "cuda"
 ckpt_dir = snapshot_download(repo_id="Kwai-Kolors/Kolors")
 ckpt_dir_canny = snapshot_download(repo_id="Kwai-Kolors/Kolors-ControlNet-Canny")
 # Add translation pipeline
 translator = pipeline("translation", model="Helsinki-NLP/opus-mt-ko-en")
 vae = AutoencoderKL.from_pretrained(f"{ckpt_dir}/vae", revision=None).half().to(device)
 scheduler = EulerDiscreteScheduler.from_pretrained(f"{ckpt_dir}/scheduler")
 unet = UNet2DConditionModel.from_pretrained(f"{ckpt_dir}/unet", revision=None).half().to(device)
 controlnet_canny = ControlNetModel.from_pretrained(f"{ckpt_dir_canny}", revision=None).half().to(device)
 pipe_canny = StableDiffusionXLControlNetImg2ImgPipeline(
     vae=vae,
     force_zeros_for_empty_prompt=False
 )
 @spaces.GPU
 def translate_korean_to_english(text):
     if any(ord(char) >= 0xAC00 and ord(char) <= 0xD7A3 for char in text):  # Check if Korean characters are present
     np_image = HWC3(np_image)
     return Image.fromarray(np_image)
 MAX_SEED = np.iinfo(np.int32).max
 MAX_IMAGE_SIZE = 1024
+def text_to_image(text, size=72, position="middle-center"):
     width, height = 1024, 576
     image = Image.new("RGB", (width, height), "white")
     draw = ImageDraw.Draw(image)
     return image
+@spaces.GPU
+def infer_canny(prompt,
+          negative_prompt = "nsfw, facial shadows, low resolution, jpeg artifacts, blurry, bad quality, dark face, neon lights",
+          seed = 397886929,
+          randomize_seed = False,
+          guidance_scale = 6.0,
+          num_inference_steps = 50,
+          controlnet_conditioning_scale = 0.7,
+          control_guidance_end = 0.9,
+          strength = 1.0
+        ):
+    prompt = translate_korean_to_english(prompt)
+    negative_prompt = translate_korean_to_english(negative_prompt)
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    generator = torch.Generator().manual_seed(seed)
+    # Generate text image
+    init_image = text_to_image(prompt)
+    init_image = resize_image(init_image, MAX_IMAGE_SIZE)
+    pipe = pipe_canny.to("cuda")
+    condi_img = process_canny_condition(np.array(init_image))
+    image = pipe(
+        prompt=prompt,
+        image=init_image,
+        controlnet_conditioning_scale=controlnet_conditioning_scale,
+        control_guidance_end=control_guidance_end,
+        strength=strength,
+        control_image=condi_img,
+        negative_prompt=negative_prompt,
+        num_inference_steps=num_inference_steps,
+        guidance_scale=guidance_scale,
+        num_images_per_prompt=1,
+        generator=generator,
+    ).images[0]
+    return [condi_img, image], seed
+css = """
+footer {
+    visibility: hidden;
+}
+"""
 with gr.Blocks(theme="Nymbo/Nymbo_Theme", css=css) as Kolors:
     with gr.Row():
         with gr.Column(elem_id="col-left"):
                     placeholder="Enter your prompt",
                     lines=2
                 )
             with gr.Accordion("Advanced Settings", open=False):
                 negative_prompt = gr.Textbox(
                     label="Negative prompt",
                     )
             with gr.Row():
                 canny_button = gr.Button("Canny", elem_id="button")
         with gr.Column(elem_id="col-right"):
             result = gr.Gallery(label="Result", show_label=False, columns=2)
             seed_used = gr.Number(label="Seed Used")
     canny_button.click(
         fn = infer_canny,
+        inputs = [prompt, negative_prompt, seed, randomize_seed, guidance_scale, num_inference_steps, controlnet_conditioning_scale, control_guidance_end, strength],
         outputs = [result, seed_used]
     )
+Kolors.queue().launch(debug=True)