import os
from dataclasses import dataclass
from PIL import Image

import cv2
import numpy as np
import gradio as gr
import torch

import spaces  # type: ignore

from huggingface_hub import hf_hub_download
from safetensors.torch import load_file

from diffusers.models.autoencoders.autoencoder_kl import AutoencoderKL
from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel
from diffusers.models.controlnets.controlnet import ControlNetModel
from diffusers.pipelines.controlnet.pipeline_controlnet import StableDiffusionControlNetPipeline
from diffusers.schedulers.scheduling_unipc_multistep import UniPCMultistepScheduler
from transformers import CLIPTextModel, CLIPTokenizer

BIG_CSS = """

/* Global bump */

.gradio-container {

  font-size: 18px !important;

}



/* Force most UI text bigger */

.gradio-container * {

  font-size: 18px !important;

}



/* Keep markdown headings bigger */

.gradio-container h1 { font-size: 28px !important; }

.gradio-container h2 { font-size: 24px !important; }

.gradio-container h3 { font-size: 20px !important; }



/* Slightly smaller helper/info text if you want */

.gradio-container .info,

.gradio-container .prose p,

.gradio-container .prose li {

  font-size: 16px !important;

  line-height: 1.35 !important;

}

"""

# -----------------------------
# Pipeline builder
# -----------------------------
def build_controlnet_pipe(
    base_model_name: str,
    controlnet: ControlNetModel,
    vae: AutoencoderKL,
    unet: UNet2DConditionModel,
    text_encoder: CLIPTextModel,
    tokenizer: CLIPTokenizer,
    device: torch.device,
    weight_dtype: torch.dtype,
    use_unipc: bool = True,
) -> StableDiffusionControlNetPipeline:
    pipe = StableDiffusionControlNetPipeline.from_pretrained(
        base_model_name,
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        unet=unet,
        controlnet=controlnet,
        safety_checker=None,
        torch_dtype=weight_dtype,
    )
    if use_unipc:
        pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
    pipe = pipe.to(device)
    pipe.set_progress_bar_config(disable=True)
    return pipe
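
# UniPC is a fast multistep solver; it usually reaches comparable quality in
# fewer steps than the pipeline's stock scheduler, which is why it is swapped
# in by default above.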

@dataclass
class CannyCFG:
    use_clahe: bool = True
    clahe_clip: float = 2.0
    clahe_grid: int = 8

    gaussian_ksize: int = 5
    gaussian_sigma: float = 1.2

    high_pct: float = 90.0     # higher -> fewer edges (stricter)
    low_ratio: float = 0.4     # low = low_ratio * high

    aperture_size: int = 3
    l2_gradient: bool = True


def canny_percentile(pil_img: Image.Image, cfg: CannyCFG) -> Image.Image:
    gray = np.array(pil_img.convert("L"), dtype=np.uint8)

    if cfg.use_clahe:
        clahe = cv2.createCLAHE(
            clipLimit=float(cfg.clahe_clip),
            tileGridSize=(int(cfg.clahe_grid), int(cfg.clahe_grid)),
        )
        gray = clahe.apply(gray)

    k = int(cfg.gaussian_ksize) | 1  # ensure odd
    blur = cv2.GaussianBlur(gray, (k, k), float(cfg.gaussian_sigma))

    gx = cv2.Sobel(blur, cv2.CV_32F, 1, 0, ksize=3)
    gy = cv2.Sobel(blur, cv2.CV_32F, 0, 1, ksize=3)
    mag = cv2.magnitude(gx, gy)

    high = float(np.percentile(mag, float(cfg.high_pct)))
    low = float(cfg.low_ratio) * high
    if high <= low:
        high = low + 1.0

    ap = int(cfg.aperture_size)
    if ap not in (3, 5, 7):
        ap = 3

    edges = cv2.Canny(
        blur,
        threshold1=low,
        threshold2=high,
        apertureSize=ap,
        L2gradient=bool(cfg.l2_gradient),
    )
    return Image.fromarray(edges, mode="L")
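
# Worked example of the percentile thresholding above: with high_pct=90.0 the
# high threshold is the 90th percentile of the Sobel gradient magnitude, and
# low = low_ratio * high. If that percentile came out at 120.0 for some image
# (a hypothetical value for illustration), Canny would run with
# threshold1=48.0 and threshold2=120.0.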


# -----------------------------
# Config
# -----------------------------
BASE_MODEL = "sd-legacy/stable-diffusion-v1-5"
WEIGHTS_REPO = "mvp-lab/ControlNet_Weight"
WEIGHTS_FILENAME = "diffusion_pytorch_model_1.safetensors"

LOCAL_WEIGHTS = os.getenv(
    "CONTROLNET_WEIGHTS",
    "/home/nik/ImperialWork/GenerativeAi/sd15-controlnet-trainer/controlnet_laion/final/diffusion_pytorch_model.safetensors",
)
if os.path.isfile(LOCAL_WEIGHTS):
    CONTROLNET_PATH = LOCAL_WEIGHTS
else:
    CONTROLNET_PATH = hf_hub_download(repo_id=WEIGHTS_REPO, filename=WEIGHTS_FILENAME, repo_type="model")
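
# The CONTROLNET_WEIGHTS env var short-circuits the hub download, e.g.
#   CONTROLNET_WEIGHTS=/path/to/weights.safetensors python app.py
# (the "app.py" name and weights path here are placeholders, not part of
# this repo).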

DTYPE = torch.float32
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")


# -----------------------------
# Model load (once)
# -----------------------------
vae = AutoencoderKL.from_pretrained(BASE_MODEL, subfolder="vae", torch_dtype=DTYPE)
unet = UNet2DConditionModel.from_pretrained(BASE_MODEL, subfolder="unet", torch_dtype=DTYPE)
tokenizer = CLIPTokenizer.from_pretrained(BASE_MODEL, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(BASE_MODEL, subfolder="text_encoder", torch_dtype=DTYPE)

vae.requires_grad_(False)
unet.requires_grad_(False)
text_encoder.requires_grad_(False)

controlnet = ControlNetModel.from_unet(unet, conditioning_channels=3)
state = load_file(CONTROLNET_PATH)
missing, unexpected = controlnet.load_state_dict(state, strict=False)
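# Surface silent mismatches between the checkpoint and the model definition;
# strict=False would otherwise swallow them.
if missing or unexpected:
    print(f"[controlnet] load_state_dict: {len(missing)} missing / {len(unexpected)} unexpected keys")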

pipe = build_controlnet_pipe(
    base_model_name=BASE_MODEL,
    controlnet=controlnet,
    vae=vae,
    unet=unet,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    device=DEVICE,
    weight_dtype=DTYPE,
    use_unipc=True,
)


# -----------------------------
# Helpers: fixed resize policy (longest side = 512, keep aspect, divisible by 8)
# -----------------------------
def round_down_to_multiple(x: int, m: int = 8) -> int:
    return max(m, (x // m) * m)

def resize_longest_side_div8(img: Image.Image, longest: int = 512) -> tuple[Image.Image, int, int]:
    w, h = img.size
    if w <= 0 or h <= 0:
        raise ValueError("Invalid image size")

    scale = float(longest) / float(max(w, h))
    tw = int(round(w * scale))
    th = int(round(h * scale))

    tw = round_down_to_multiple(tw, 8)
    th = round_down_to_multiple(th, 8)

    tw = max(8, tw)
    th = max(8, th)

    resized = img.resize((tw, th), resample=Image.Resampling.BICUBIC)
    return resized, tw, th
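
# Worked example: a 1024x768 input scales by 512/1024 to 512x384 (both already
# multiples of 8); a 1000x600 input scales to 512x307, and 307 rounds down to
# 304, giving 512x304.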

def compute_canny_rgb(img_rgb_resized: Image.Image, use_clahe: bool, edge_amount: float, smoothing: float) -> Image.Image:
    high_pct = 95.0 - 20.0 * float(edge_amount)  # 0 => 95 (few), 1 => 75 (many)
    high_pct = float(np.clip(high_pct, 70.0, 99.0))

    gaussian_sigma = 0.6 + 2.2 * float(smoothing)  # 0 => 0.6, 1 => 2.8

    cfg = CannyCFG(
        use_clahe=bool(use_clahe),
        clahe_clip=2.0,
        clahe_grid=8,
        gaussian_ksize=5,
        gaussian_sigma=float(gaussian_sigma),
        high_pct=float(high_pct),
        low_ratio=0.4,
        aperture_size=3,
        l2_gradient=True,
    )
    edges_l = canny_percentile(img_rgb_resized, cfg)
    return edges_l.convert("RGB")
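
# Worked example of the slider mapping: edge_amount=0.6 gives
# high_pct = 95 - 20 * 0.6 = 83 (more edges kept than the default 90), and
# smoothing=0.4 gives gaussian_sigma = 0.6 + 2.2 * 0.4 = 1.48.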

def update_canny_preview(input_image, use_clahe, edge_amount, smoothing):
    if input_image is None:
        return None, None, 512, 512

    if not isinstance(input_image, Image.Image):
        input_image = Image.fromarray(input_image)

    img_rgb0 = input_image.convert("RGB")
    img_rgb, width, height = resize_longest_side_div8(img_rgb0, longest=512)

    canny = compute_canny_rgb(
        img_rgb,
        use_clahe=use_clahe,
        edge_amount=float(edge_amount),
        smoothing=float(smoothing),
    )
    return canny, canny, width, height  # same image feeds both the preview and the hidden state


@spaces.GPU
@torch.inference_mode()
def generate_from_canny(
    canny: Image.Image,
    width: int,
    height: int,
    prompt: str,
    negative_prompt: str,
    guidance_scale: float,
    num_inference_steps: int,
    num_images: int,
    controlnet_conditioning_scale: float,
):
    if canny is None:
        raise gr.Error("Canny conditioning image missing. Upload an image first.")
    if int(num_images) < 1:
        raise gr.Error("num_images must be >= 1")

    gens = [torch.Generator(device=DEVICE).manual_seed(i) for i in range(int(num_images))]

    imgs = pipe(
        prompt=[prompt] * int(num_images),
        negative_prompt=[negative_prompt] * int(num_images),
        image=[canny] * int(num_images),
        num_inference_steps=int(num_inference_steps),
        guidance_scale=float(guidance_scale),
        height=int(height),
        width=int(width),
        generator=gens,
        controlnet_conditioning_scale=float(controlnet_conditioning_scale),
    ).images # type: ignore

    first = imgs[0] if imgs else None
    return first, imgs
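
# Seeds are fixed at 0..num_images-1, so re-running with identical inputs
# reproduces the same batch; swap in random seeds if per-click variety is
# preferred.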

def next_image(images, idx):
    if not images:
        return None, 0, "0 / 0"
    idx = (int(idx) + 1) % len(images)
    return images[idx], idx, f"{idx + 1} / {len(images)}"

def prev_image(images, idx):
    if not images:
        return None, 0, "0 / 0"
    idx = (int(idx) - 1) % len(images)
    return images[idx], idx, f"{idx + 1} / {len(images)}"
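
# Both pagers index with modulo, so paging wraps: Next on the last image
# returns to the first, and Prev on the first jumps to the last.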


# -----------------------------
# UI
# -----------------------------
IMG_H = 360  # uniform-ish size for both preview boxes

with gr.Blocks(css=BIG_CSS) as demo:
    gr.Markdown("# Canny-Edge ControlNet Demo")
    gr.Markdown("**Note:** Trained on aesthetic/artistic images — best results come from similar, stylised inputs.")

    # state
    canny_state = gr.State(None)
    width_state = gr.State(512)
    height_state = gr.State(512)

    gen_images_state = gr.State([])  # list[PIL]
    gen_index_state = gr.State(0)

    with gr.Row():
        # ---- Left: Canny + Canny controls ----
        with gr.Column(scale=1):
            input_image = gr.Image(
                label="Input Image",
                type="pil",
                image_mode="RGB",
                height=IMG_H,
            )

            canny_preview = gr.Image(
                label="Canny edges",
                type="pil",
                height=IMG_H,
            )

            gr.Markdown("### Edge controls")
            use_clahe = gr.Checkbox(
                label="Stabilise contrast (CLAHE)",
                value=True,
                info="Helps edges stay consistent under different lighting/contrast.",
            )
            edge_amount = gr.Slider(
                label="Edge Amount",
                minimum=0.0, maximum=1.0, value=0.6, step=0.01,
                info="More = detect more edges (more detail). Less = cleaner outline.",
            )
            smoothing = gr.Slider(
                label="Smoothing",
                minimum=0.0, maximum=1.0, value=0.4, step=0.01,
                info="More = reduce tiny texture/noise edges, cleaner structure.",
            )

        # ---- Right: Generated output + generation controls ----
        with gr.Column(scale=1):
            generated = gr.Image(
                label="Generated image",
                type="pil",
                height=IMG_H,
            )

            with gr.Row():
                prev_btn = gr.Button("◀ Prev")
                page_label = gr.Markdown("0 / 0")
                next_btn = gr.Button("Next ▶")

            gr.Markdown("### Generation controls")
            positive_prompt = gr.Textbox(
                label="Positive Prompt",
                value="",
                lines=2,
                info="Describe what you want. The edges guide the structure.",
            )
            negative_prompt = gr.Textbox(
                label="Negative Prompt",
                value="",
                lines=2,
                info="Things to avoid (e.g. blurry, deformed, low quality).",
            )

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=1.0, maximum=15.0, value=7.5, step=0.1,
                    info="Higher = follow text prompt more strongly (can drift from edges).",
                )
                controlnet_conditioning_scale = gr.Slider(
                    label="Control Strength",
                    minimum=0.0, maximum=2.0, value=1.0, step=0.05,
                    info="Higher = follow edges more strongly. Too high can reduce creativity.",
                )

            with gr.Row():
                num_inference_steps = gr.Slider(
                    label="Steps",
                    minimum=10, maximum=80, value=50, step=1,
                    info="More steps can improve quality but is slower.",
                )
                num_images = gr.Slider(
                    label="Samples",
                    minimum=1, maximum=8, value=4, step=1,
                    info="How many images to generate.",
                )

            run_btn = gr.Button("Generate", variant="primary")

    # Auto-update Canny preview on changes (CPU)
    auto_inputs = [input_image, use_clahe, edge_amount, smoothing]
    for c in auto_inputs:
        c.change(
            fn=update_canny_preview,
            inputs=auto_inputs,
            outputs=[canny_preview, canny_state, width_state, height_state],
        )

    # Generate (GPU) -> store list -> show first -> update paging label
    run_btn.click(
        fn=generate_from_canny,
        inputs=[
            canny_state,
            width_state,
            height_state,
            positive_prompt,
            negative_prompt,
            guidance_scale,
            num_inference_steps,
            num_images,
            controlnet_conditioning_scale,
        ],
        outputs=[generated, gen_images_state],  # visible output first => proper "Generating..." UX
    ).then(
        fn=lambda imgs: (0, f"1 / {len(imgs)}") if imgs else (0, "0 / 0"),
        inputs=[gen_images_state],
        outputs=[gen_index_state, page_label],
    )

    # Paging buttons (CPU)
    next_btn.click(
        fn=next_image,
        inputs=[gen_images_state, gen_index_state],
        outputs=[generated, gen_index_state, page_label],
    )
    prev_btn.click(
        fn=prev_image,
        inputs=[gen_images_state, gen_index_state],
        outputs=[generated, gen_index_state, page_label],
    )

if __name__ == "__main__":
    demo.launch()