dee-Z-Image-Turbo

Sleeping

App Files Files Community

telcom committed on Dec 28, 2025

Commit

fd2d5e7

verified ·

1 Parent(s): f762924

Update app.py

Browse files

Files changed (1) hide show

app.py +384 -103

app.py CHANGED Viewed

@@ -1,172 +1,453 @@
 import os
 import gc
 import random
 import numpy as np
-import torch
 import gradio as gr
-import cv2
 from PIL import Image
 from diffusers import StableDiffusionXLInpaintPipeline
-from huggingface_hub import login
-# --- GroundingDINO ---
 from groundingdino.util.inference import load_model, predict
-# --- SAM ---
 from segment_anything import sam_model_registry, SamPredictor
 # ============================================================
-# CONFIG
 # ============================================================
-HF_TOKEN = os.getenv("HF_TOKEN", "")
 if HF_TOKEN:
-    login(HF_TOKEN)
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-DTYPE = torch.float16 if DEVICE == "cuda" else torch.float32
-INPAINT_MODEL = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
-# Clothing keywords (you can tune this)
-CLOTHING_PROMPT = "shirt, jacket, coat, dress, hoodie, sweater, t-shirt"
 # ============================================================
-# LOAD MODELS
 # ============================================================
-# --- GroundingDINO ---
-dino = load_model(
-    "GroundingDINO/groundingdino_swint_ogc.pth",
-    "GroundingDINO/groundingdino_swint_ogc.cfg.py",
-)
-# --- SAM ---
-sam = sam_model_registry["vit_h"](
-    checkpoint="sam_vit_h_4b8939.pth"
-)
-sam.to(DEVICE)
-sam_predictor = SamPredictor(sam)
-# --- SDXL Inpaint ---
-pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
-    INPAINT_MODEL,
-    torch_dtype=DTYPE,
-    use_safetensors=True,
-).to(DEVICE)
-pipe.set_progress_bar_config(disable=True)
 # ============================================================
-# UTILS
 # ============================================================
-def pil_to_cv(img):
-    return cv2.cvtColor(np.array(img), cv2.COLOR_RGB2BGR)
-def cv_to_pil(img):
-    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
-def detect_clothing_mask(image: Image.Image):
-    """Automatically detect clothing and return a binary mask"""
-    img_cv = pil_to_cv(image)
-    h, w, _ = img_cv.shape
-    boxes, _, _ = predict(
         model=dino,
-        image=img_cv,
-        caption=CLOTHING_PROMPT,
-        box_threshold=0.35,
-        text_threshold=0.25,
     )
-    if len(boxes) == 0:
         return None
-    # Convert normalized boxes to pixels
     boxes_px = []
-    for box in boxes:
-        x1 = int((box[0] - box[2] / 2) * w)
-        y1 = int((box[1] - box[3] / 2) * h)
-        x2 = int((box[0] + box[2] / 2) * w)
-        y2 = int((box[1] + box[3] / 2) * h)
-        boxes_px.append([x1, y1, x2, y2])
-    # SAM segmentation
-    sam_predictor.set_image(img_cv)
-    masks = []
     for box in boxes_px:
-        mask, _, _ = sam_predictor.predict(
-            box=np.array(box),
             multimask_output=False,
         )
-        masks.append(mask[0])
-    # Merge all masks
-    full_mask = np.zeros((h, w), dtype=np.uint8)
-    for m in masks:
-        full_mask[m] = 255
-    return Image.fromarray(full_mask)
 # ============================================================
-# INFERENCE
 # ============================================================
-def replace_clothing(image, prompt, seed):
-    if image is None or not prompt:
-        return None, "Upload an image and provide a prompt."
-    mask = detect_clothing_mask(image)
-    if mask is None:
-        return image, "No clothing detected."
     generator = torch.Generator(device=DEVICE).manual_seed(seed)
-    with torch.inference_mode():
-        out = pipe(
-            prompt=prompt,
-            image=image,
-            mask_image=mask,
-            guidance_scale=7.0,
-            num_inference_steps=30,
-            generator=generator,
-        )
-    gc.collect()
-    if DEVICE == "cuda":
-        torch.cuda.empty_cache()
-    return out.images[0], "Clothing replaced automatically."
 # ============================================================
 # UI
 # ============================================================
-with gr.Blocks(title="Auto Clothing Replacement") as demo:
-    gr.Markdown("## Automatic Clothing Replacement (no mask, no painting)")
-    gr.Markdown("Upload a photo, describe the new outfit. Everything else is automatic.")
-    image = gr.Image(type="pil", label="Input image")
-    prompt = gr.Textbox(label="New clothing description")
-    seed = gr.Slider(0, 999999, value=0, label="Seed")
-    run = gr.Button("Replace Clothing")
-    output = gr.Image(label="Result")
-    status = gr.Markdown()
-    run.click(
-        replace_clothing,
-        inputs=[image, prompt, seed],
-        outputs=[output, status],
     )
-demo.launch()

+# app.py
+# ============================================================
+# Automatic clothing replacement:
+#   1) Detect clothing boxes with GroundingDINO
+#   2) Turn boxes into pixel mask with SAM
+#   3) Inpaint mask with SDXL Inpaint
+#
+# Input: ONE image, NO manual paint, NO manual mask
+# ============================================================
 import os
 import gc
 import random
+import warnings
+import logging
 import numpy as np
 import gradio as gr
 from PIL import Image
+import torch
+from huggingface_hub import login, hf_hub_download
+# Diffusers SDXL inpaint
 from diffusers import StableDiffusionXLInpaintPipeline
+# GroundingDINO
 from groundingdino.util.inference import load_model, predict
+# SAM
 from segment_anything import sam_model_registry, SamPredictor
 # ============================================================
+# Spaces GPU decorator (must be imported early)
+# ============================================================
+try:  # `spaces` is optional; it only supplies the @spaces.GPU decorator used on infer()
+    import spaces  # noqa: F401
+    SPACES_AVAILABLE = True
+except Exception:  # any import-time failure just disables the decorator
+    SPACES_AVAILABLE = False
+# ============================================================
+# Basic config
 # ============================================================
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings("ignore")
+logging.getLogger("transformers").setLevel(logging.ERROR)
+HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
 if HF_TOKEN:
+    login(token=HF_TOKEN)
+MAX_SEED = np.iinfo(np.int32).max
 DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+DTYPE = torch.bfloat16 if (DEVICE == "cuda" and torch.cuda.is_bf16_supported()) else (torch.float16 if DEVICE == "cuda" else torch.float32)
+MAX_IMAGE_SIZE = 1024 if DEVICE == "cuda" else 768
+# You can tune what the detector looks for
+DEFAULT_CLOTHING_QUERY = "shirt, t-shirt, jacket, coat, hoodie, sweater, dress, pants, jeans, skirt, clothing"
+# SDXL inpaint model
+INPAINT_MODEL = os.environ.get(
+    "INPAINT_MODEL",
+    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
+).strip()
+# Detection thresholds (tune for your data)
+DEFAULT_BOX_THRESHOLD = 0.35
+DEFAULT_TEXT_THRESHOLD = 0.25
 # ============================================================
+# Model loading with hf_hub_download (no local file assumptions)
 # ============================================================
+model_loaded = False  # set to True only after _download_and_load_models() succeeds
+load_error = None  # repr() of the exception when model loading failed
+dino = None  # GroundingDINO model, assigned by _download_and_load_models()
+sam_predictor = None  # SamPredictor instance, assigned by _download_and_load_models()
+pipe = None  # SDXL inpaint pipeline, assigned by _download_and_load_models()
+def _download_and_load_models():
+    global dino, sam_predictor, pipe
+    # --------------------------
+    # 1) GroundingDINO download
+    # --------------------------
+    # Official repo commonly used on HF Hub
+    DINO_REPO = "IDEA-Research/GroundingDINO"
+    dino_cfg_path = hf_hub_download(
+        repo_id=DINO_REPO,
+        filename="groundingdino/config/GroundingDINO_SwinT_OGC.py",
+        token=HF_TOKEN if HF_TOKEN else None,
+    )
+    dino_ckpt_path = hf_hub_download(
+        repo_id=DINO_REPO,
+        filename="groundingdino_swint_ogc.pth",
+        token=HF_TOKEN if HF_TOKEN else None,
+    )
+    dino = load_model(dino_cfg_path, dino_ckpt_path)
+    # --------------------------
+    # 2) SAM download
+    # --------------------------
+    # Many installs use this HF repo mirror
+    SAM_REPO = "facebook/sam-vit-huge"
+    sam_ckpt_path = hf_hub_download(
+        repo_id=SAM_REPO,
+        filename="sam_vit_h_4b8939.pth",
+        token=HF_TOKEN if HF_TOKEN else None,
+    )
+    sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt_path)
+    sam.to(DEVICE)
+    sam_predictor = SamPredictor(sam)
+    # --------------------------
+    # 3) SDXL Inpaint pipeline
+    # --------------------------
+    fp_kwargs = {
+        "torch_dtype": DTYPE,
+        "use_safetensors": True,
+    }
+    if HF_TOKEN:
+        fp_kwargs["token"] = HF_TOKEN
+    pipe = StableDiffusionXLInpaintPipeline.from_pretrained(INPAINT_MODEL, **fp_kwargs).to(DEVICE)
+    try:
+        pipe.set_progress_bar_config(disable=True)
+    except Exception:
+        pass
+try:
+    _download_and_load_models()
+    model_loaded = True
+except Exception as e:
+    model_loaded = False
+    load_error = repr(e)
 # ============================================================
+# Image helpers
 # ============================================================
+def make_error_image(w: int, h: int) -> Image.Image:
+    return Image.new("RGB", (int(w), int(h)), (18, 18, 22))
+def _fit_to_multiple_of_64(w: int, h: int):
+    # SDXL likes multiples of 64
+    w = max(256, (w // 64) * 64)
+    h = max(256, (h // 64) * 64)
+    return w, h
+def _resize_rgb(img: Image.Image, w: int, h: int) -> Image.Image:
+    return img.convert("RGB").resize((w, h), Image.LANCZOS)
+def _resize_mask(mask: Image.Image, w: int, h: int) -> Image.Image:
+    return mask.convert("L").resize((w, h), Image.NEAREST)
+def _dilate_mask(mask_np: np.ndarray, radius: int) -> np.ndarray:
+    if radius <= 0:
+        return mask_np
+    import cv2
+    kernel = np.ones((radius * 2 + 1, radius * 2 + 1), np.uint8)
+    return cv2.dilate(mask_np, kernel, iterations=1)
+def _largest_component(mask_np: np.ndarray) -> np.ndarray:
+    # Optional cleanup: keep only largest connected region
+    import cv2
+    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask_np, connectivity=8)
+    if num_labels <= 1:
+        return mask_np
+    largest = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])
+    out = np.zeros_like(mask_np)
+    out[labels == largest] = 255
+    return out
+# ============================================================
+# Detect clothing and create a mask
+# ============================================================
+def detect_clothing_mask(
+    image: Image.Image,
+    clothing_query: str,
+    box_threshold: float,
+    text_threshold: float,
+    dilate_radius: int,
+    keep_largest: bool,
+):
+    """
+    Returns a PIL L mask: white = edit, black = keep
+    """
+    if image is None:
+        return None
+    img_rgb = image.convert("RGB")
+    w, h = img_rgb.size
+    # GroundingDINO expects numpy image (H,W,3) in RGB usually
+    img_np = np.array(img_rgb)
+    boxes, logits, phrases = predict(
         model=dino,
+        image=img_np,
+        caption=clothing_query,
+        box_threshold=float(box_threshold),
+        text_threshold=float(text_threshold),
     )
+    if boxes is None or len(boxes) == 0:
         return None
+    # Convert boxes to pixel coords
+    # GroundingDINO returns boxes as [cx, cy, w, h] normalized (0..1)
     boxes_px = []
+    for b in boxes:
+        cx, cy, bw, bh = float(b[0]), float(b[1]), float(b[2]), float(b[3])
+        x1 = int((cx - bw / 2.0) * w)
+        y1 = int((cy - bh / 2.0) * h)
+        x2 = int((cx + bw / 2.0) * w)
+        y2 = int((cy + bh / 2.0) * h)
+        x1 = max(0, min(w - 1, x1))
+        y1 = max(0, min(h - 1, y1))
+        x2 = max(0, min(w - 1, x2))
+        y2 = max(0, min(h - 1, y2))
+        if x2 > x1 and y2 > y1:
+            boxes_px.append([x1, y1, x2, y2])
+    if not boxes_px:
+        return None
+    # SAM segmentation on original resolution
+    sam_predictor.set_image(img_np)
+    full_mask = np.zeros((h, w), dtype=np.uint8)
     for box in boxes_px:
+        # SAM expects box in XYXY pixel coords
+        box_arr = np.array(box, dtype=np.float32)
+        masks, scores, _ = sam_predictor.predict(
+            box=box_arr,
             multimask_output=False,
         )
+        m = masks[0].astype(np.uint8) * 255
+        full_mask = np.maximum(full_mask, m)
+    # Optional cleanup
+    if keep_largest:
+        full_mask = _largest_component(full_mask)
+    # Optional dilation to cover seams and edges
+    full_mask = _dilate_mask(full_mask, int(dilate_radius))
+    return Image.fromarray(full_mask, mode="L")
 # ============================================================
+# Inference
 # ============================================================
+def _infer_impl(
+    image,
+    prompt,
+    negative_prompt,
+    clothing_query,
+    seed,
+    randomize_seed,
+    width,
+    height,
+    guidance_scale,
+    num_inference_steps,
+    box_threshold,
+    text_threshold,
+    dilate_radius,
+    keep_largest,
+):
+    width = int(width)
+    height = int(height)
+    if not model_loaded:
+        return make_error_image(width, height), f"Model load failed: {load_error}"
+    if image is None:
+        return make_error_image(width, height), "Error: please upload an image."
+    prompt = (prompt or "").strip()
+    if not prompt:
+        return make_error_image(width, height), "Error: prompt is empty."
+    neg = (negative_prompt or "").strip()
+    if not neg:
+        neg = None
+    clothing_query = (clothing_query or "").strip()
+    if not clothing_query:
+        clothing_query = DEFAULT_CLOTHING_QUERY
+    # Seed
+    if randomize_seed:
+        seed = random.randint(0, MAX_SEED)
+    else:
+        seed = int(seed)
     generator = torch.Generator(device=DEVICE).manual_seed(seed)
+    # Fit resolution
+    width, height = _fit_to_multiple_of_64(width, height)
+    width = min(width, MAX_IMAGE_SIZE)
+    height = min(height, MAX_IMAGE_SIZE)
+    # Detect clothing mask on original image
+    mask = detect_clothing_mask(
+        image=image,
+        clothing_query=clothing_query,
+        box_threshold=float(box_threshold),
+        text_threshold=float(text_threshold),
+        dilate_radius=int(dilate_radius),
+        keep_largest=bool(keep_largest),
+    )
+    if mask is None:
+        return image, f"Seed: {seed}. No clothing region detected, try adjusting thresholds or query."
+    # Resize image and mask to target size
+    img_resized = _resize_rgb(image, width, height)
+    mask_resized = _resize_mask(mask, width, height)
+    status = f"Seed: {seed}"
+    if DEVICE != "cuda":
+        status += " | Running on CPU, this will be slow."
+    try:
+        with torch.inference_mode():
+            if DEVICE == "cuda":
+                with torch.autocast("cuda", dtype=DTYPE):
+                    out = pipe(
+                        prompt=prompt,
+                        negative_prompt=neg,
+                        image=img_resized,
+                        mask_image=mask_resized,
+                        guidance_scale=float(guidance_scale),
+                        num_inference_steps=int(num_inference_steps),
+                        generator=generator,
+                    )
+            else:
+                out = pipe(
+                    prompt=prompt,
+                    negative_prompt=neg,
+                    image=img_resized,
+                    mask_image=mask_resized,
+                    guidance_scale=float(guidance_scale),
+                    num_inference_steps=int(num_inference_steps),
+                    generator=generator,
+                )
+        result = out.images[0]
+        return result, status
+    except Exception as e:
+        return make_error_image(width, height), f"Error: {type(e).__name__}: {e}"
+    finally:
+        gc.collect()
+        if DEVICE == "cuda":
+            torch.cuda.empty_cache()
+if SPACES_AVAILABLE:
+    @spaces.GPU
+    def infer(*args, **kwargs):
+        return _infer_impl(*args, **kwargs)
+else:
+    def infer(*args, **kwargs):
+        return _infer_impl(*args, **kwargs)
 # ============================================================
 # UI
 # ============================================================
+CSS = """
+body { background: #000; color: #fff; }
+"""
+with gr.Blocks(title="Auto Clothing Replacement") as demo:
+    gr.HTML(f"<style>{CSS}</style>")  # injects CSS via an inline <style> tag
+    gr.Markdown("## Automatic Clothing Replacement (no paint, no manual mask)")
+    gr.Markdown("Upload a photo, describe the new clothing. The system detects clothing and inpaints automatically.")
+    if not model_loaded:  # show the import-time load failure in the page
+        gr.Markdown(f"⚠️ Model failed to load:\n\n{load_error}")
+    with gr.Row():
+        image = gr.Image(type="pil", label="Input image")
+    prompt = gr.Textbox(label="Prompt (describe new clothing)", lines=2, placeholder="e.g., a red leather jacket with zipper, realistic fabric folds")
+    negative_prompt = gr.Textbox(label="Negative prompt (optional)", lines=2, placeholder="e.g., blurry, deformed, low quality")
+    run_button = gr.Button("Replace Clothing")
+    result = gr.Image(label="Result")
+    status = gr.Markdown("")
+    with gr.Accordion("Advanced settings", open=False):
+        clothing_query = gr.Textbox(label="Detection query (what counts as clothing)", value=DEFAULT_CLOTHING_QUERY)
+        seed = gr.Slider(0, MAX_SEED, step=1, value=0, label="Seed")
+        randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
+        width = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=768 if DEVICE != "cuda" else 1024, label="Width")
+        height = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=768 if DEVICE != "cuda" else 1024, label="Height")
+        guidance_scale = gr.Slider(0.0, 15.0, step=0.1, value=7.0, label="Guidance scale")
+        num_inference_steps = gr.Slider(1, 80, step=1, value=30, label="Steps")
+        box_threshold = gr.Slider(0.05, 0.90, step=0.01, value=DEFAULT_BOX_THRESHOLD, label="Box threshold (GroundingDINO)")
+        text_threshold = gr.Slider(0.05, 0.90, step=0.01, value=DEFAULT_TEXT_THRESHOLD, label="Text threshold (GroundingDINO)")
+        dilate_radius = gr.Slider(0, 30, step=1, value=8, label="Mask dilation radius (cover edges)")
+        keep_largest = gr.Checkbox(value=True, label="Keep only largest clothing region")
+    run_button.click(
+        fn=infer,
+        inputs=[  # order must match the positional parameters of _infer_impl
+            image,
+            prompt,
+            negative_prompt,
+            clothing_query,
+            seed,
+            randomize_seed,
+            width,
+            height,
+            guidance_scale,
+            num_inference_steps,
+            box_threshold,
+            text_threshold,
+            dilate_radius,
+            keep_largest,
+        ],
+        outputs=[result, status],
     )
+if __name__ == "__main__":
+    demo.queue().launch(ssr_mode=False)