import os
import sys
import glob
# ---------------------------------------------------------
# 0) Make sure local packages (diffusers3, preprocess, etc.) are importable on HF Spaces
# ---------------------------------------------------------
ROOT = os.path.dirname(os.path.abspath(__file__))
if ROOT not in sys.path:
    sys.path.insert(0, ROOT)
print("[BOOT] ROOT =", ROOT, flush=True)
print("[BOOT] sys.path[:5] =", sys.path[:5], flush=True)
import tempfile
from dataclasses import dataclass
from functools import lru_cache
from typing import Optional, Tuple, List, Dict
import gradio as gr
import torch
import numpy as np
import cv2
import imageio
from PIL import Image, ImageOps
from transformers import pipeline
from huggingface_hub import hf_hub_download
import diffusers3
print("[BOOT] diffusers3 loaded from:", getattr(diffusers3, "__file__", "<?>"), flush=True)
from diffusers import UniPCMultistepScheduler, AutoencoderKL, UNet2DConditionModel
from diffusers3.models.controlnet import ControlNetModel
from diffusers3.pipelines.controlnet.pipeline_controlnet_sd_xl_img2img_img import (
    StableDiffusionXLControlNetImg2ImgPipeline,
)
from ip_adapter import IPAdapterXL
# extractor
from preprocess.simple_extractor import run as run_simple_extractor
# =========================
# HF Hub repo ids
# =========================
BASE_MODEL_ID = "stabilityai/stable-diffusion-xl-base-1.0"
CONTROLNET_ID = "diffusers/controlnet-depth-sdxl-1.0"
# assets dataset repo
ASSETS_REPO = os.getenv("ASSETS_REPO", "soye/VISTA_assets")
ASSETS_REPO_TYPE = "dataset"
depth_estimator = pipeline("depth-estimation")
def asset_path(relpath: str) -> str:
    return hf_hub_download(
        repo_id=ASSETS_REPO,
        repo_type=ASSETS_REPO_TYPE,
        filename=relpath,
    )
@lru_cache(maxsize=1)
def get_assets():
    print("[ASSETS] Downloading assets from:", ASSETS_REPO, flush=True)
    image_encoder_weight = asset_path("image_encoder/model.safetensors")
    _ = asset_path("image_encoder/config.json")
    image_encoder_dir = os.path.dirname(image_encoder_weight)
    ip_ckpt = asset_path("ip_adapter/ip-adapter_sdxl_vit-h.bin")
    schp_ckpt = asset_path("preprocess_ckpts/exp-schp-201908301523-atr.pth")
    print("[ASSETS] image_encoder_dir =", image_encoder_dir, flush=True)
    print("[ASSETS] ip_ckpt =", ip_ckpt, flush=True)
    print("[ASSETS] schp_ckpt =", schp_ckpt, flush=True)
    return image_encoder_dir, ip_ckpt, schp_ckpt
# =========================
# Example assets for Gradio UI (kept separate per input)
# =========================
def _is_image_file(p: str) -> bool:
    ext = os.path.splitext(p.lower())[1]
    return ext in (".png", ".jpg", ".jpeg", ".webp")
def build_ui_example_lists(root_dir: str = ROOT) -> Dict[str, List[str]]:
    """
    Returns dict of example filepaths:
    - persons : [{root}/examples/person/*]
    - styles  : [{root}/examples/style/*]
    - sketches: [{root}/examples/sketch/*] (optional)
    """
    person_dir = os.path.join(root_dir, "examples", "person")
    style_dir = os.path.join(root_dir, "examples", "style")
    sketch_dir = os.path.join(root_dir, "examples", "sketch")
    persons = [p for p in sorted(glob.glob(os.path.join(person_dir, "*"))) if _is_image_file(p)]
    styles = [p for p in sorted(glob.glob(os.path.join(style_dir, "*"))) if _is_image_file(p)]
    sketches = [p for p in sorted(glob.glob(os.path.join(sketch_dir, "*"))) if _is_image_file(p)]
    return {"persons": persons, "styles": styles, "sketches": sketches}
DEFAULT_STEPS = 40
DEBUG_SAVE = False
H: Optional[int] = None
W: Optional[int] = None
@dataclass
class Paths:
    person_path: str
    depth_path: Optional[str]  # sketch (guide), optional
    style_path: Optional[str]  # style, optional
    output_path: str
def _imread_or_raise(path: str, flag=cv2.IMREAD_COLOR):
    img = cv2.imread(path, flag)
    if img is None:
        raise FileNotFoundError(f"cv2.imread failed: {path} (exists={os.path.exists(path)})")
    return img
def _pad_or_crop_to_width_np(arr: np.ndarray, target_width: int, pad_value):
    """
    arr: HxWxC or HxW
    Center-crop, or pad left/right (possibly asymmetrically), so the width
    matches target_width exactly.
    """
    if arr.ndim not in (2, 3):
        raise ValueError(f"arr must be 2D or 3D, got shape={arr.shape}")
    h = arr.shape[0]
    w = arr.shape[1]
    if w == target_width:
        return arr
    if w > target_width:
        left = (w - target_width) // 2
        return arr[:, left:left + target_width] if arr.ndim == 2 else arr[:, left:left + target_width, :]
    # w < target_width: pad
    total = target_width - w
    left = total // 2
    right = total - left  # the remainder goes to the right, so the width is always exactly target_width
    return cv2.copyMakeBorder(
        arr, 0, 0, left, right,
        borderType=cv2.BORDER_CONSTANT,
        value=pad_value,
    )
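# Minimal self-check sketch for _pad_or_crop_to_width_np (my own example, not
# part of the original pipeline): a 4-wide array padded to 7 gains 1 column on
# the left and 2 on the right; a 10-wide array cropped to 7 keeps the center.
def _example_pad_or_crop_to_width():
    narrow = np.zeros((2, 4), dtype=np.uint8)
    wide = np.zeros((2, 10), dtype=np.uint8)
    assert _pad_or_crop_to_width_np(narrow, 7, pad_value=0).shape == (2, 7)
    assert _pad_or_crop_to_width_np(wide, 7, pad_value=0).shape == (2, 7)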
def apply_parsing_white_mask_to_person_cv2(
    person_pil: Image.Image,
    parsing_img: Image.Image
) -> np.ndarray:
    person_rgb = np.array(person_pil.convert("RGB"), dtype=np.uint8)
    mask = np.array(parsing_img.convert("L"), dtype=np.uint8)
    if mask.shape[:2] != person_rgb.shape[:2]:
        mask = cv2.resize(mask, (person_rgb.shape[1], person_rgb.shape[0]), interpolation=cv2.INTER_NEAREST)
    white_mask = (mask == 255)
    result_rgb = np.full_like(person_rgb, 255, dtype=np.uint8)
    result_rgb[white_mask] = person_rgb[white_mask]
    result_bgr = cv2.cvtColor(result_rgb, cv2.COLOR_RGB2BGR)
    return result_bgr
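# Hypothetical sketch of the masking behavior (synthetic inputs I made up):
# pixels where the parsing mask is exactly 255 keep the person's colors; all
# other pixels become white, and the result comes back in BGR order for cv2.
def _example_apply_parsing_white_mask():
    person = Image.new("RGB", (4, 4), (10, 20, 30))
    parsing = Image.new("L", (4, 4), 0)
    parsing.putpixel((0, 0), 255)  # keep only the top-left pixel
    out_bgr = apply_parsing_white_mask_to_person_cv2(person, parsing)
    assert tuple(out_bgr[0, 0]) == (30, 20, 10)  # BGR of (10, 20, 30)
    assert tuple(out_bgr[1, 1]) == (255, 255, 255)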
def remove_small_white_components(
    parsing_img: Image.Image,
    *,
    white_threshold: int = 128,
    min_white_area: int = 150,
    use_open: bool = False,
    open_ksize: int = 3,
    morph_iters: int = 1,
) -> Image.Image:
    """
    - Binarize with white as the foreground.
    - Remove only small white blobs via connected components.
    - (Optional) apply a very gentle OPEN to remove small dots/edges
      (no CLOSE, which would grow the white region).
    """
    if not isinstance(parsing_img, Image.Image):
        raise TypeError("parsing_img must be a PIL.Image.Image")
    arr = np.array(parsing_img.convert("L"), dtype=np.uint8)
    mask = np.where(arr >= int(white_threshold), 255, 0).astype(np.uint8)
    # 1) remove small white connected components
    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask, connectivity=8)
    keep = np.zeros_like(mask)
    for lab in range(1, num_labels):
        area = int(stats[lab, cv2.CC_STAT_AREA])
        if area >= int(min_white_area):
            keep[labels == lab] = 255
    mask = keep
    # 2) (optional) OPEN: removes small white dots/edges and slightly cleans
    #    the boundary without growing the white region
    if use_open and int(open_ksize) > 1:
        k = int(open_ksize)
        if k % 2 == 0:
            k += 1
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (k, k))
        mask = cv2.morphologyEx(mask, cv2.MORPH_OPEN, kernel, iterations=int(morph_iters))
    return Image.fromarray(mask, mode="L")
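# Synthetic sanity-check sketch (my own example): a 40x40 blob survives the
# default min_white_area=150 (area 1600), while a lone 2x2 speck (area 4) is
# dropped by the connected-components filter.
def _example_remove_small_white_components():
    arr = np.zeros((100, 100), dtype=np.uint8)
    arr[10:50, 10:50] = 255  # large blob, kept
    arr[80:82, 80:82] = 255  # tiny speck, removed
    cleaned = np.array(remove_small_white_components(Image.fromarray(arr, mode="L")))
    assert cleaned[20, 20] == 255 and cleaned[80, 80] == 0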
def compute_hw_from_person(person_path: str):
    img = _imread_or_raise(person_path)
    orig_h, orig_w = img.shape[:2]
    scale = 1024.0 / float(orig_h)
    new_h = 1024
    new_w = int(round(orig_w * scale))
    if new_w > 1024:
        new_w = 1024
    return new_h, new_w
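# Worked example of the sizing rule above (numbers are my own): the height is
# pinned to 1024 and the width scales by 1024 / orig_h, capped at 1024.
#   1500x2000 (WxH): scale = 1024 / 2000 = 0.512 -> (H, W) = (1024, 768)
#   3000x2000 (WxH): scaled width 1536 exceeds the cap -> (H, W) = (1024, 1024)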
def fill_sketch_from_image_path_to_pil(image_path: str) -> Image.Image:
    global H, W
    if H is None or W is None:
        raise RuntimeError("Global H/W not set.")
    img = _imread_or_raise(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.bitwise_not(img)
    img = cv2.resize(img, (W, H), interpolation=cv2.INTER_NEAREST)
    _, binary = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY_INV)
    contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    filled = np.zeros_like(binary)
    cv2.drawContours(filled, contours, -1, 255, thickness=cv2.FILLED)
    filled_rgb = cv2.cvtColor(filled, cv2.COLOR_GRAY2RGB)
    return Image.fromarray(filled_rgb)
def _resize_pil_nearest(img: Image.Image, size_wh: Tuple[int, int], *, force_mode: Optional[str] = None) -> Image.Image:
    """
    Resize PIL image to (W,H) using INTER_NEAREST (safe for masks).
    size_wh: (width, height)
    """
    w, h = int(size_wh[0]), int(size_wh[1])
    if force_mode is not None:
        img = img.convert(force_mode)
    arr = np.array(img, dtype=np.uint8)
    if arr.ndim == 2:
        resized = cv2.resize(arr, (w, h), interpolation=cv2.INTER_NEAREST)
        return Image.fromarray(resized, mode="L")
    elif arr.ndim == 3 and arr.shape[2] == 3:
        resized = cv2.resize(arr, (w, h), interpolation=cv2.INTER_NEAREST)
        return Image.fromarray(resized, mode="RGB")
    else:
        raise ValueError(f"Unsupported image array shape: {arr.shape}")
def merge_white_regions_or(img1: Image.Image, img2: Image.Image) -> Image.Image:
    a = np.array(img1.convert("RGB"), dtype=np.uint8)
    b = np.array(img2.convert("RGB"), dtype=np.uint8)
    # safety: make the shapes identical to avoid a numpy broadcasting error
    if a.shape[:2] != b.shape[:2]:
        b = cv2.resize(b, (a.shape[1], a.shape[0]), interpolation=cv2.INTER_NEAREST)
    white_a = np.all(a == 255, axis=-1)
    white_b = np.all(b == 255, axis=-1)
    out = a.copy()
    out[white_a | white_b] = 255
    return Image.fromarray(out, mode="RGB")
def preprocess_mask(mask_img: Image.Image) -> Image.Image:
    global H, W
    m = np.array(mask_img.convert("L"), dtype=np.uint8)
    if (H is not None) and (W is not None):
        m = cv2.resize(m, (W, H), interpolation=cv2.INTER_NEAREST)
    _, m = cv2.threshold(m, 127, 255, cv2.THRESH_BINARY)
    target_width = 1024
    m = _pad_or_crop_to_width_np(m, target_width, pad_value=0)
    kernel = np.ones((12, 12), np.uint8)
    m = cv2.dilate(m, kernel, iterations=1)
    if DEBUG_SAVE:
        cv2.imwrite("mask_final_1024.png", m)
    return Image.fromarray(m, mode="L").convert("RGB")
def make_depth(depth_path: str) -> Image.Image:
    global H, W
    if H is None or W is None:
        raise RuntimeError("Global H/W not set. Call run_one() first.")
    depth_img = _imread_or_raise(depth_path, 0)  # grayscale
    # (optional) force binarization in case the input is not strictly 0/255
    _, depth_bin = cv2.threshold(depth_img, 127, 255, cv2.THRESH_BINARY)
    # contour filling (to close holes)
    contours, _ = cv2.findContours(depth_bin, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    filled_depth = np.zeros_like(depth_bin)
    cv2.drawContours(filled_depth, contours, -1, 255, thickness=cv2.FILLED)
    # resize masks with NEAREST
    filled_depth = cv2.resize(filled_depth, (W, H), interpolation=cv2.INTER_NEAREST)
    # (optional) re-binarize after the resize
    _, filled_depth = cv2.threshold(filled_depth, 127, 255, cv2.THRESH_BINARY)
    filled_depth = _pad_or_crop_to_width_np(filled_depth, 1024, pad_value=0)
    # erode here (the opposite of dilation): shrink the white region slightly
    erode_ksize = 5  # 3/5/7... the larger, the more it shrinks
    erode_iters = 1  # 1-2 recommended
    if erode_ksize is not None and erode_ksize > 1 and erode_iters > 0:
        if erode_ksize % 2 == 0:
            erode_ksize += 1
        kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (erode_ksize, erode_ksize))
        filled_depth = cv2.erode(filled_depth, kernel, iterations=erode_iters)
    # safely re-binarize
    _, filled_depth = cv2.threshold(filled_depth, 127, 255, cv2.THRESH_BINARY)
    inverted_image = ImageOps.invert(Image.fromarray(filled_depth))
    with torch.inference_mode():
        image_depth = depth_estimator(inverted_image)["depth"]
    return image_depth
def _edges_from_parsing(parsing_img: Image.Image) -> np.ndarray:
    m = np.array(parsing_img.convert("L"), dtype=np.uint8)
    _, m_bin = cv2.threshold(m, 127, 255, cv2.THRESH_BINARY)
    edges = cv2.Canny(m_bin, 50, 150)
    edges = cv2.dilate(edges, np.ones((3, 3), np.uint8), iterations=1)
    return edges.astype(np.uint8)
def make_depth_from_parsing_edges(parsing_img: Image.Image) -> Image.Image:
    global H, W
    if H is None or W is None:
        raise RuntimeError("Global H/W not set. Call run_one() first.")
    depth_img = _edges_from_parsing(parsing_img)
    contours, _ = cv2.findContours(depth_img, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    filled_depth = depth_img.copy()
    cv2.drawContours(filled_depth, contours, -1, (255), thickness=cv2.FILLED)
    filled_depth = cv2.resize(filled_depth, (W, H), interpolation=cv2.INTER_AREA)
    filled_depth = _pad_or_crop_to_width_np(filled_depth, 1024, pad_value=0)
    inverted_image = ImageOps.invert(Image.fromarray(filled_depth))
    with torch.inference_mode():
        image_depth = depth_estimator(inverted_image)["depth"]
    if DEBUG_SAVE:
        image_depth.save("depth.png")
    return image_depth
def center_crop_lr_to_768x1024(arr: np.ndarray) -> np.ndarray:
    target_h, target_w = 1024, 768
    h, w = arr.shape[:2]
    if h != target_h:
        arr = cv2.resize(arr, (w, target_h), interpolation=cv2.INTER_AREA)
        h, w = arr.shape[:2]
    if w < target_w:
        left_pad = (target_w - w) // 2
        right_pad = target_w - w - left_pad  # absorb the remainder so the width reaches target_w even when the deficit is odd
        arr = cv2.copyMakeBorder(arr, 0, 0, left_pad, right_pad, cv2.BORDER_CONSTANT, value=[255, 255, 255])
        w = arr.shape[1]
    left = (w - target_w) // 2
    return arr[:, left:left + target_w]
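# Shape sketch for the fixed-size crop (synthetic input of mine): a 1024x1024
# array keeps its height and loses (1024 - 768) // 2 = 128 columns per side.
def _example_center_crop_lr():
    arr = np.zeros((1024, 1024, 3), dtype=np.uint8)
    assert center_crop_lr_to_768x1024(arr).shape == (1024, 768, 3)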
def save_cropped(imgs, out_path: str):
    np_imgs = [np.asarray(im) for im in imgs]
    cropped = [center_crop_lr_to_768x1024(x) for x in np_imgs]
    out = np.concatenate(cropped, axis=1)
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    imageio.imsave(out_path, out)
def _read_hw(path: str) -> Tuple[int, int]:
    img = _imread_or_raise(path)  # BGR
    h, w = img.shape[:2]
    return h, w
def _center_crop_lr_to_aspect(arr: np.ndarray, target_aspect: float, *, pad_value=255) -> np.ndarray:
    """
    arr: HxWxC (RGB) or HxW
    target_aspect = target_w / target_h
    - The height (H) is kept as-is.
    - Crop left/right symmetrically to match target_aspect.
    - If the current width is insufficient, pad left/right instead.
    """
    if arr.ndim == 2:
        arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB)
    h, w = arr.shape[:2]
    if h <= 0 or w <= 0:
        raise ValueError(f"Invalid image shape: {arr.shape}")
    desired_w = int(round(h * float(target_aspect)))
    if desired_w <= 0:
        desired_w = 1
    # if the width suffices, crop left/right
    if w >= desired_w:
        left = (w - desired_w) // 2
        right = left + desired_w
        return arr[:, left:right]
    # if the width is insufficient, pad left/right (a crop was requested, but this is a safety net)
    total = desired_w - w
    left_pad = total // 2
    right_pad = total - left_pad
    return cv2.copyMakeBorder(
        arr,
        0, 0,
        left_pad, right_pad,
        borderType=cv2.BORDER_CONSTANT,
        value=[pad_value, pad_value, pad_value],
    )
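# Aspect-matching sketch (synthetic numbers of mine): for a 1024-tall array
# and target_aspect 0.75, desired_w = round(1024 * 0.75) = 768, so a 1000-wide
# input is center-cropped and a 700-wide input is padded out to 768.
def _example_center_crop_lr_to_aspect():
    tall = np.zeros((1024, 1000, 3), dtype=np.uint8)
    narrow = np.zeros((1024, 700, 3), dtype=np.uint8)
    assert _center_crop_lr_to_aspect(tall, 0.75).shape == (1024, 768, 3)
    assert _center_crop_lr_to_aspect(narrow, 0.75).shape == (1024, 768, 3)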
def save_output_match_person(imgs, out_path: str, person_path: str):
    """
    - Center-crop the output imgs (usually a single image) left/right to match
      the person image's aspect ratio.
    - Resize to the person's original (W, H).
    - (If imgs has several images) concatenate them horizontally after processing.
    """
    person_h, person_w = _read_hw(person_path)
    target_aspect = float(person_w) / float(person_h)
    np_imgs = []
    for im in imgs:
        if isinstance(im, Image.Image):
            arr = np.asarray(im.convert("RGB"), dtype=np.uint8)
        else:
            # handle numpy input just in case
            arr = np.asarray(im, dtype=np.uint8)
            if arr.ndim == 2:
                arr = cv2.cvtColor(arr, cv2.COLOR_GRAY2RGB)
        cropped = _center_crop_lr_to_aspect(arr, target_aspect, pad_value=255)
        resized = cv2.resize(cropped, (person_w, person_h), interpolation=cv2.INTER_AREA)
        np_imgs.append(resized)
    out = np.concatenate(np_imgs, axis=1)  # unchanged if imgs has a single image
    os.makedirs(os.path.dirname(out_path), exist_ok=True)
    imageio.imsave(out_path, out)
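# File-roundtrip sketch for save_output_match_person (paths and sizes are my
# own test values): a square 1024x1024 output is cropped to the person's 1:2
# aspect ratio and resized back to the person's original 500x1000 size.
def _example_save_output_match_person():
    d = tempfile.mkdtemp(prefix="vista_example_")
    person_p = os.path.join(d, "person.png")
    Image.new("RGB", (500, 1000), (0, 0, 0)).save(person_p)
    out_p = os.path.join(d, "out", "result.png")
    save_output_match_person([Image.new("RGB", (1024, 1024), (255, 0, 0))], out_p, person_p)
    assert Image.open(out_p).size == (500, 1000)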
@lru_cache(maxsize=1)
def get_pipe_and_device() -> Tuple[StableDiffusionXLControlNetImg2ImgPipeline, str, torch.dtype]:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    dtype = torch.float32
    print(f"[PIPE] device={device}, dtype={dtype}", flush=True)
    controlnet = ControlNetModel.from_pretrained(
        CONTROLNET_ID,
        torch_dtype=dtype,
        use_safetensors=True,
    ).to(device)
    vae = AutoencoderKL.from_pretrained(
        BASE_MODEL_ID,
        subfolder="vae",
        torch_dtype=dtype,
        use_safetensors=True,
    ).to(device)
    unet = UNet2DConditionModel.from_pretrained(
        BASE_MODEL_ID,
        subfolder="unet",
        torch_dtype=dtype,
        use_safetensors=True,
    ).to(device)
    pipe = StableDiffusionXLControlNetImg2ImgPipeline.from_pretrained(
        BASE_MODEL_ID,
        controlnet=controlnet,
        vae=vae,
        unet=unet,
        torch_dtype=dtype,
        use_safetensors=True,
        add_watermarker=False,
    ).to(device)
    if device == "cuda":
        try:
            pipe.vae.to(dtype=dtype)
            if hasattr(pipe.vae, "config") and hasattr(pipe.vae.config, "force_upcast"):
                pipe.vae.config.force_upcast = False
        except Exception as e:
            print("[PIPE] VAE dtype cast failed:", repr(e), flush=True)
    pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
    pipe.enable_attention_slicing()
    try:
        pipe.enable_xformers_memory_efficient_attention()
    except Exception as e:
        print("[PIPE] xformers not enabled:", repr(e), flush=True)
    return pipe, device, dtype
# Map UI labels to the internal extractor category strings
_UI_TO_EXTRACTOR_CATEGORY = {
    "Upper-body": "Upper-cloth",
    "Lower-body": "Bottom",
    "Dress": "Dress",
}
def _has_valid_file(path: Optional[str]) -> bool:
    return (
        path is not None
        and isinstance(path, str)
        and len(path) > 0
        and os.path.exists(path)
    )
def _resolve_content_style_scales(style_present: bool, prompt_present: bool) -> Tuple[float, float]:
    """
    Rules:
    - no style image         -> (0.0, 0.0)
    - style but no prompt    -> (0.35, 0.65)
    - both style and prompt  -> (0.25, 0.5)
    """
    if not style_present:
        return 0.0, 0.0
    if not prompt_present:
        return 0.35, 0.65
    return 0.25, 0.5
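# Truth-table sketch for the scale rules above (mirrors the code, my own check):
def _example_resolve_scales():
    assert _resolve_content_style_scales(False, False) == (0.0, 0.0)
    assert _resolve_content_style_scales(False, True) == (0.0, 0.0)
    assert _resolve_content_style_scales(True, False) == (0.35, 0.65)
    assert _resolve_content_style_scales(True, True) == (0.25, 0.5)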
def run_one(paths: Paths, prompt: str, steps: int = DEFAULT_STEPS, category: str = "Dress"):
    global H, W
    pipe, device, _dtype = get_pipe_and_device()
    image_encoder_dir, ip_ckpt, schp_ckpt = get_assets()
    H, W = compute_hw_from_person(paths.person_path)
    extractor_category = _UI_TO_EXTRACTOR_CATEGORY.get(category, "Dress")
    res = run_simple_extractor(
        category=extractor_category,
        input_path=os.path.abspath(paths.person_path),
        model_restore=schp_ckpt,
    )
    parsing_img = res["images"][0] if res.get("images") else None
    if parsing_img is None:
        raise RuntimeError("run_simple_extractor returned no parsing images.")
    parsing_img = remove_small_white_components(
        parsing_img,
        white_threshold=128,
        min_white_area=150,  # tune between roughly 30 and 200 per dataset
        use_open=False,
    )
    # IMPORTANT: extractor output size can differ from (W,H). Align before OR-merge.
    if parsing_img.size != (W, H):
        parsing_img = _resize_pil_nearest(parsing_img, (W, H), force_mode="L")
    use_depth_path = _has_valid_file(paths.depth_path)
    if use_depth_path:
        sketch_area = fill_sketch_from_image_path_to_pil(paths.depth_path)
    else:
        sketch_area = parsing_img.convert("RGB")
    merged_img = merge_white_regions_or(parsing_img, sketch_area)
    mask_pil = preprocess_mask(merged_img)
    # person
    person_bgr = _imread_or_raise(paths.person_path)
    person_bgr = cv2.resize(person_bgr, (W, H), interpolation=cv2.INTER_AREA)
    person_bgr = _pad_or_crop_to_width_np(person_bgr, 1024, pad_value=[255, 255, 255])
    person_rgb = cv2.cvtColor(person_bgr, cv2.COLOR_BGR2RGB)
    person_pil = Image.fromarray(person_rgb)
    # depth
    if use_depth_path:
        depth_map = make_depth(paths.depth_path)
    else:
        depth_map = make_depth_from_parsing_edges(parsing_img)
    # garment image (the key step: force the 1024 width here)
    personn = Image.open(paths.person_path).convert("RGB")
    garment_bgr = apply_parsing_white_mask_to_person_cv2(personn, parsing_img)
    garment_rgb = cv2.cvtColor(garment_bgr, cv2.COLOR_BGR2RGB)
    garment_rgb = cv2.resize(garment_rgb, (W, H), interpolation=cv2.INTER_AREA)
    garment_rgb = _pad_or_crop_to_width_np(garment_rgb, 1024, pad_value=[255, 255, 255])
    garment_pil = Image.fromarray(garment_rgb)
    # garment mask (aligned to the same 1024 width)
    gm = np.array(parsing_img.convert("L"), dtype=np.uint8)
    gm = cv2.resize(gm, (W, H), interpolation=cv2.INTER_NEAREST)
    gm = cv2.cvtColor(gm, cv2.COLOR_GRAY2RGB)
    gm = _pad_or_crop_to_width_np(gm, 1024, pad_value=[0, 0, 0])
    garment_mask_pil = Image.fromarray(gm)
    # decide the IP-Adapter scales from what the user provided
    style_present = _has_valid_file(paths.style_path)
    prompt_present = (prompt is not None) and (str(prompt).strip() != "")
    content_scale, style_scale = _resolve_content_style_scales(style_present, prompt_present)
    print(
        "[SIZE] person:", person_pil.size,
        "mask:", mask_pil.size,
        "depth:", depth_map.size,
        "garment:", garment_pil.size,
        "gmask:", garment_mask_pil.size,
        "ui_category:", category,
        "extractor_category:", extractor_category,
        "style_present:", style_present,
        "prompt_present:", prompt_present,
        "content_scale:", content_scale,
        "style_scale:", style_scale,
        flush=True,
    )
    ip_model = IPAdapterXL(
        pipe,
        image_encoder_dir,
        ip_ckpt,
        device,
        mask_pil,
        person_pil,
        content_scale=content_scale,
        style_scale=style_scale,
        garment_images=garment_pil,
        garment_mask=garment_mask_pil,
    )
    if device == "cuda":
        pipe.to(dtype=torch.float32)
        try:
            for _, proc in pipe.unet.attn_processors.items():
                proc.to(dtype=torch.float32)
        except Exception:
            pass
    # when there is no style image, substitute something so the generate input is never None
    if style_present:
        style_img = Image.open(paths.style_path).convert("RGB")
    else:
        # the scales are 0, so this has no effect; it only satisfies the function signature
        style_img = garment_pil
    # prompt construction (unchanged behavior)
    if prompt is not None and str(prompt).strip() != "":
        prompt = extractor_category + " with " + str(prompt).strip()
    else:
        prompt = extractor_category
    with torch.inference_mode():
        images = ip_model.generate(
            pil_image=style_img,
            image=person_pil,
            control_image=depth_map,
            strength=1.0,
            num_samples=1,
            num_inference_steps=int(steps),
            shape_prompt="",
            prompt=prompt or "",
            num=0,
            scale=None,
            controlnet_conditioning_scale=0.7,
            guidance_scale=7.5,
        )
    save_output_match_person(images, paths.output_path, paths.person_path)
    return images, mask_pil, depth_map, person_pil, garment_pil, garment_mask_pil
def set_seed(seed: int):
    if seed is None or seed < 0:
        return
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
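# Reproducibility sketch (my own example): the same non-negative seed makes
# consecutive torch draws identical; a negative seed is a no-op by design.
def _example_set_seed():
    set_seed(42)
    a = torch.rand(3)
    set_seed(42)
    b = torch.rand(3)
    assert torch.equal(a, b)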
def infer_web(person_fp, sketch_fp, style_fp, prompt, steps, seed, category):
    print("[UI] infer_web called", flush=True)
    # only the person image is required; style and sketch are optional
    if person_fp is None:
        raise gr.Error("The person image is required. (style/sketch are optional)")
    if category not in ("Upper-body", "Lower-body", "Dress"):
        raise gr.Error(f"Invalid category: {category}")
    set_seed(int(seed) if seed is not None else -1)
    tmp_dir = tempfile.mkdtemp(prefix="vista_demo_")
    out_path = os.path.join(tmp_dir, "result.png")
    paths = Paths(
        person_path=person_fp,
        depth_path=sketch_fp,
        style_path=style_fp,  # may be None
        output_path=out_path,
    )
    run_one(paths, prompt=prompt, steps=int(steps), category=category)
    out_img = Image.open(out_path).convert("RGB")
    # the click handler below binds exactly two outputs (image + file),
    # so return only those; the debug intermediates stay inside run_one
    return out_img, out_path
with gr.Blocks(title="VISTA Demo (HF Spaces)") as demo:
    gr.Markdown(
        "The first run may take a while because the models have to load.<br>"
        "Make sure Category matches the garment type you want to try on.",
        elem_classes="tight_md",
    )
    category_toggle = gr.Radio(
        choices=["Dress", "Upper-body", "Lower-body"],
        value="Dress",
        label="Category",
        interactive=True,
    )
    # example lists (kept separate per input)
    ex = build_ui_example_lists(ROOT)
    person_examples = [[p] for p in ex["persons"]]
    style_examples = [[p] for p in ex["styles"]]
    sketch_examples = [[p] for p in ex["sketches"]]
    # one row: Person / Style / Output
    with gr.Row():
        # -------- Person column --------
        with gr.Column(scale=1):
            person_in = gr.Image(label="Person Image (required)", type="filepath")
            if person_examples:
                gr.Markdown("#### Examples")
                gr.Examples(
                    examples=person_examples,
                    inputs=[person_in],
                    examples_per_page=8,
                )
        # -------- Style column --------
        with gr.Column(scale=1):
            style_in = gr.Image(label="Style Image (optional)", type="filepath")
            if style_examples:
                gr.Markdown("#### Examples")
                gr.Examples(
                    examples=style_examples,
                    inputs=[style_in],
                    examples_per_page=8,
                )
        # -------- Output column --------
        with gr.Column(scale=1):
            out_img = gr.Image(label="Output", type="pil")
    with gr.Accordion("Sketch / Guide (optional)", open=False):
        sketch_in = gr.Image(
            label="Sketch / Guide (pair it with the person of the same number: person 1 goes with sketch 1). The sketch must be aligned with the person's body.",
            type="filepath",
        )
        if sketch_examples:
            gr.Markdown("#### Examples")
            gr.Examples(
                examples=sketch_examples,
                inputs=[sketch_in],
                examples_per_page=8,
            )
    with gr.Row():
        prompt_in = gr.Textbox(
            label="Prompt",
            value="",
            placeholder="e.g. crystal, lace, button, ...",
            lines=2,
        )
        steps_in = gr.Slider(1, 80, value=DEFAULT_STEPS, step=1, label="Steps")
        seed_in = gr.Number(label="Seed (-1=random)", value=-1, precision=0)
    run_btn = gr.Button("Run")
    out_file = gr.File(label="Download result.png")
    run_btn.click(
        fn=infer_web,
        inputs=[person_in, sketch_in, style_in, prompt_in, steps_in, seed_in, category_toggle],
        outputs=[out_img, out_file],
    )
demo.queue()
if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
|