from diffusers import (
  ControlNetModel,
  StableDiffusionImg2ImgPipeline,
  StableDiffusionControlNetImg2ImgPipeline,
)
from compel import Compel
from PIL import Image
import cv2
import gc
import gradio
import numpy
import torch

# Model identifiers handed to the `from_pretrained` loaders below.
base_model = "SimianLuo/LCM_Dreamshaper_v7"
controlnet_model = "lllyasviel/control_v11p_sd15_canny"

# Target device for model offloading and the dtype the pipelines are cast to.
device, dtype = "cuda", torch.float16

# Every input image is resized to this size (in pixels) before inference.
width = height = 512

# Load the ControlNet weights, requesting `dtype` at load time. The loader's
# option is spelled `torch_dtype`; any other spelling is not treated as the
# dtype request.
controlnet = ControlNetModel.from_pretrained(
  controlnet_model, torch_dtype=dtype
)

# ControlNet-conditioned img2img pipeline built on `base_model`, with the
# safety checker disabled.
pipe = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
  base_model,
  safety_checker=None,
  controlnet=controlnet,
)
# Cast the pipeline's modules to `dtype`.
pipe = pipe.to(dtype=dtype)
# Enable diffusers' model CPU offload, targeting `device`.
pipe.enable_model_cpu_offload(device=device)
# Switch the UNet to the channels-last memory format.
pipe.unet.to(memory_format=torch.channels_last)

# Prompt-to-embedding helper bound to the ControlNet pipeline's tokenizer and
# text encoder; long prompts are not truncated.
compel_proc = Compel(
  truncate_long_prompts=False,
  text_encoder=pipe.text_encoder,
  tokenizer=pipe.tokenizer,
)

# Plain img2img pipeline (no ControlNet) built on the same base model, with
# the safety checker disabled and its modules cast to `dtype`.
pipe_no_controlnet = StableDiffusionImg2ImgPipeline.from_pretrained(
  base_model, safety_checker=None
).to(dtype=dtype)
# Enable model CPU offload on *this* pipeline, targeting the configured
# device (the ControlNet pipeline configures its own offload separately).
pipe_no_controlnet.enable_model_cpu_offload(device=device)

# Prompt-to-embedding helper bound to the plain img2img pipeline's tokenizer
# and text encoder; long prompts are not truncated.
compel_proc_no_controlnet = Compel(
  truncate_long_prompts=False,
  text_encoder=pipe_no_controlnet.text_encoder,
  tokenizer=pipe_no_controlnet.tokenizer,
)

def _canny_control_image(image, lower_threshold, higher_threshold):
  """Return a 3-channel PIL edge map of `image` computed with `cv2.Canny`."""
  edges = cv2.Canny(numpy.array(image), lower_threshold, higher_threshold)
  # Canny yields a single-channel map; replicate it along a new last axis so
  # the control image has three identical channels.
  return Image.fromarray(numpy.stack([edges, edges, edges], axis=2))


def predict(
  prompt: str,
  image: Image.Image,
  use_controlnet: bool,
  generator: int,
  num_inference_steps: int,
  strength: float,
  guidance_scale: float,
  controlnet_conditioning_scale: float,
  canny_lower_threshold: int,
  canny_higher_threshold: int,
):
  """Run img2img on `image` guided by `prompt` and return the first result.

  Args:
    prompt: Text prompt; converted to embeddings with Compel.
    image: Input PIL image, resized to `width` x `height`. If None, the
      function returns None without running a pipeline.
    use_controlnet: When true, a Canny edge map of the resized input is used
      as the control image for the ControlNet pipeline; otherwise the plain
      img2img pipeline is used.
    generator: Integer seed passed to `torch.manual_seed`.
    num_inference_steps: Forwarded to the pipeline.
    strength: Forwarded to the pipeline.
    guidance_scale: Forwarded to the pipeline.
    controlnet_conditioning_scale: Forwarded to the ControlNet pipeline only.
    canny_lower_threshold: First threshold passed to `cv2.Canny`.
    canny_higher_threshold: Second threshold passed to `cv2.Canny`.

  Returns:
    The first generated PIL image, or None if there is no input image or the
    pipeline produced no images.
  """
  if image is None:
    return None

  generator = torch.manual_seed(generator)
  # TODO: Keep the original ratio?
  image = image.resize((width, height))

  # Arguments common to both pipelines.
  shared_kwargs = dict(
    generator=generator,
    guidance_scale=guidance_scale,
    image=image,
    num_inference_steps=num_inference_steps,
    output_type="pil",
    strength=strength,
  )

  if use_controlnet:
    # Only inference is performed, so the prompt is encoded without autograd
    # tracking (Compel's README recommends this to limit memory usage).
    with torch.no_grad():
      prompt_embeds = compel_proc(prompt)
    control_image = _canny_control_image(
      image, canny_lower_threshold, canny_higher_threshold
    )
    try:
      results = pipe(
        control_image=control_image,
        control_guidance_end=1.0,
        control_guidance_start=0.0,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
        prompt_embeds=prompt_embeds,
        **shared_kwargs,
      )
    finally:
      # Release the edge map even when the pipeline raises.
      control_image.close()
  else:
    with torch.no_grad():
      prompt_embeds = compel_proc_no_controlnet(prompt)
    results = pipe_no_controlnet(prompt_embeds=prompt_embeds, **shared_kwargs)

  gc.collect()

  return results.images[0] if results.images else None

# Gradio UI: the input widgets are listed in the same order as the parameters
# of `predict`, and the output is a single PIL image.
app = gradio.Interface(
  fn=predict,
  inputs=[
    # prompt
    gradio.Textbox(value="Kirisame Marisa, Cute, Smiling, High quality, Realistic"),
    # image
    gradio.Image(type="pil"),
    # use_controlnet
    gradio.Checkbox(value=True),
    # generator
    gradio.Slider(minimum=0, maximum=2147483647, value=2159232, step=1),
    # num_inference_steps
    gradio.Slider(minimum=2, maximum=15, value=4, step=1),
    # strength
    gradio.Slider(minimum=0.0, maximum=1.0, value=0.5, step=0.01),
    # guidance_scale
    gradio.Slider(minimum=0.0, maximum=5.0, value=0.2, step=0.01),
    # controlnet_conditioning_scale
    gradio.Slider(minimum=0.0, maximum=1.0, value=0.8, step=0.01),
    # canny_lower_threshold
    gradio.Slider(minimum=0, maximum=255, value=100, step=1),
    # canny_higher_threshold
    gradio.Slider(minimum=0, maximum=255, value=200, step=1),
  ],
  outputs=gradio.Image(type="pil"),
)
# Start the web server as soon as the script runs.
app.launch()