Spaces: Running on Zero
Update app.py
app.py CHANGED
@@ -1,11 +1,8 @@
 # app.py
 # ============================================================
-# Automatic clothing replacement
-#
-#
-# 3) Inpaint mask with SDXL Inpaint
-#
-# Input: ONE image, NO manual paint, NO manual mask
+# Automatic clothing replacement (no paint, no manual mask)
+# GroundingDINO -> SAM -> SDXL Inpaint
+# Fixes: Spaces requires @spaces.GPU function at startup
 # ============================================================
 
 import os
@@ -21,28 +18,24 @@ from PIL import Image
 import torch
 from huggingface_hub import login, hf_hub_download
 
-# Diffusers SDXL inpaint
 from diffusers import StableDiffusionXLInpaintPipeline
-
-# GroundingDINO
 from groundingdino.util.inference import load_model, predict
-
-# SAM
 from segment_anything import sam_model_registry, SamPredictor
 
 
 # ============================================================
-# Spaces
+# Spaces import (do not hide the decorated function)
 # ============================================================
 try:
-    import spaces
+    import spaces
     SPACES_AVAILABLE = True
 except Exception:
+    spaces = None
     SPACES_AVAILABLE = False
 
 
 # ============================================================
-#
+# Config
 # ============================================================
 os.environ["TOKENIZERS_PARALLELISM"] = "false"
 warnings.filterwarnings("ignore")
@@ -54,29 +47,31 @@ if HF_TOKEN:
 
 MAX_SEED = np.iinfo(np.int32).max
 
-
-
+CUDA_OK = torch.cuda.is_available()
+DEVICE = "cuda" if CUDA_OK else "cpu"
+
+if CUDA_OK and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
+    DTYPE = torch.bfloat16
+elif CUDA_OK:
+    DTYPE = torch.float16
+else:
+    DTYPE = torch.float32
 
-MAX_IMAGE_SIZE = 1024 if
+MAX_IMAGE_SIZE = 1024 if CUDA_OK else 768
 
-# You can tune what the detector looks for
 DEFAULT_CLOTHING_QUERY = "shirt, t-shirt, jacket, coat, hoodie, sweater, dress, pants, jeans, skirt, clothing"
+DEFAULT_BOX_THRESHOLD = 0.35
+DEFAULT_TEXT_THRESHOLD = 0.25
 
-# SDXL inpaint model
 INPAINT_MODEL = os.environ.get(
     "INPAINT_MODEL",
     "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
 ).strip()
 
-# Detection thresholds (tune for your data)
-DEFAULT_BOX_THRESHOLD = 0.35
-DEFAULT_TEXT_THRESHOLD = 0.25
-
 
 # ============================================================
-#
+# Load models (download from HF Hub)
 # ============================================================
-
 model_loaded = False
 load_error = None
 
@@ -84,13 +79,10 @@ dino = None
 sam_predictor = None
 pipe = None
 
-def _download_and_load_models():
+def download_and_load_models():
     global dino, sam_predictor, pipe
 
-    #
-    # 1) GroundingDINO download
-    # --------------------------
-    # Official repo commonly used on HF Hub
+    # ---- GroundingDINO ----
     DINO_REPO = "IDEA-Research/GroundingDINO"
     dino_cfg_path = hf_hub_download(
         repo_id=DINO_REPO,
@@ -104,41 +96,30 @@ def _download_and_load_models():
     )
     dino = load_model(dino_cfg_path, dino_ckpt_path)
 
-    #
-    # 2) SAM download
-    # --------------------------
-    # Many installs use this HF repo mirror
+    # ---- SAM ----
     SAM_REPO = "facebook/sam-vit-huge"
     sam_ckpt_path = hf_hub_download(
         repo_id=SAM_REPO,
         filename="sam_vit_h_4b8939.pth",
         token=HF_TOKEN if HF_TOKEN else None,
     )
-
     sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt_path)
     sam.to(DEVICE)
     sam_predictor = SamPredictor(sam)
 
-    #
-
-    # --------------------------
-    fp_kwargs = {
-        "torch_dtype": DTYPE,
-        "use_safetensors": True,
-    }
+    # ---- SDXL Inpaint ----
+    fp_kwargs = {"torch_dtype": DTYPE, "use_safetensors": True}
     if HF_TOKEN:
         fp_kwargs["token"] = HF_TOKEN
 
     pipe = StableDiffusionXLInpaintPipeline.from_pretrained(INPAINT_MODEL, **fp_kwargs).to(DEVICE)
-
     try:
         pipe.set_progress_bar_config(disable=True)
     except Exception:
         pass
 
-
 try:
-    _download_and_load_models()
+    download_and_load_models()
     model_loaded = True
 except Exception as e:
     model_loaded = False
@@ -146,33 +127,30 @@ except Exception as e:
 
 
 # ============================================================
-#
+# Helpers
 # ============================================================
-
 def make_error_image(w: int, h: int) -> Image.Image:
     return Image.new("RGB", (int(w), int(h)), (18, 18, 22))
 
-def _fit_to_multiple_of_64(w: int, h: int):
-    # SDXL likes multiples of 64
+def fit64(w: int, h: int):
     w = max(256, (w // 64) * 64)
     h = max(256, (h // 64) * 64)
     return w, h
 
-def _resize_rgb(img: Image.Image, w: int, h: int) -> Image.Image:
+def resize_rgb(img: Image.Image, w: int, h: int) -> Image.Image:
     return img.convert("RGB").resize((w, h), Image.LANCZOS)
 
-def _resize_mask(mask: Image.Image, w: int, h: int) -> Image.Image:
+def resize_mask(mask: Image.Image, w: int, h: int) -> Image.Image:
     return mask.convert("L").resize((w, h), Image.NEAREST)
 
-def _dilate_mask(mask_np: np.ndarray, radius: int) -> np.ndarray:
+def dilate_mask(mask_np: np.ndarray, radius: int) -> np.ndarray:
     if radius <= 0:
         return mask_np
     import cv2
     kernel = np.ones((radius * 2 + 1, radius * 2 + 1), np.uint8)
     return cv2.dilate(mask_np, kernel, iterations=1)
 
-def _largest_component(mask_np: np.ndarray) -> np.ndarray:
-    # Optional cleanup: keep only largest connected region
+def largest_component(mask_np: np.ndarray) -> np.ndarray:
     import cv2
     num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask_np, connectivity=8)
     if num_labels <= 1:
@@ -183,10 +161,6 @@ def _largest_component(mask_np: np.ndarray) -> np.ndarray:
     return out
 
 
-# ============================================================
-# Detect clothing and create a mask
-# ============================================================
-
 def detect_clothing_mask(
     image: Image.Image,
     clothing_query: str,
@@ -195,19 +169,14 @@ def detect_clothing_mask(
     dilate_radius: int,
     keep_largest: bool,
 ):
-    """
-    Returns a PIL L mask: white = edit, black = keep
-    """
     if image is None:
         return None
 
     img_rgb = image.convert("RGB")
     w, h = img_rgb.size
-
-    # GroundingDINO expects numpy image (H,W,3) in RGB usually
     img_np = np.array(img_rgb)
 
-    boxes, logits, phrases = predict(
+    boxes, _, _ = predict(
         model=dino,
         image=img_np,
         caption=clothing_query,
@@ -218,11 +187,9 @@ def detect_clothing_mask(
     if boxes is None or len(boxes) == 0:
         return None
 
-    # Convert boxes to pixel coords
-    # GroundingDINO returns boxes as [cx, cy, w, h] normalized (0..1)
     boxes_px = []
     for b in boxes:
-        cx, cy, bw, bh =
+        cx, cy, bw, bh = map(float, b)
         x1 = int((cx - bw / 2.0) * w)
         y1 = int((cy - bh / 2.0) * h)
         x2 = int((cx + bw / 2.0) * w)
@@ -237,37 +204,27 @@ def detect_clothing_mask(
     if not boxes_px:
         return None
 
-    # SAM segmentation on original resolution
     sam_predictor.set_image(img_np)
 
     full_mask = np.zeros((h, w), dtype=np.uint8)
-
     for box in boxes_px:
-        # SAM expects box in XYXY pixel coords
         box_arr = np.array(box, dtype=np.float32)
-
-        masks, scores, _ = sam_predictor.predict(
-            box=box_arr,
-            multimask_output=False,
-        )
+        masks, _, _ = sam_predictor.predict(box=box_arr, multimask_output=False)
         m = masks[0].astype(np.uint8) * 255
         full_mask = np.maximum(full_mask, m)
 
-    # Optional cleanup
     if keep_largest:
-        full_mask = _largest_component(full_mask)
+        full_mask = largest_component(full_mask)
 
-
-    full_mask = _dilate_mask(full_mask, int(dilate_radius))
+    full_mask = dilate_mask(full_mask, int(dilate_radius))
 
     return Image.fromarray(full_mask, mode="L")
 
 
 # ============================================================
-#
+# Core inference (no decorator here)
 # ============================================================
-
-def _infer_impl(
+def infer_core(
     image,
     prompt,
     negative_prompt,
@@ -290,7 +247,7 @@ def _infer_impl(
         return make_error_image(width, height), f"Model load failed: {load_error}"
 
     if image is None:
-        return make_error_image(width, height), "Error:
+        return make_error_image(width, height), "Error: upload an image."
 
     prompt = (prompt or "").strip()
     if not prompt:
@@ -300,11 +257,8 @@ def _infer_impl(
     if not neg:
         neg = None
 
-    clothing_query = (clothing_query or "").strip()
-    if not clothing_query:
-        clothing_query = DEFAULT_CLOTHING_QUERY
+    clothing_query = (clothing_query or "").strip() or DEFAULT_CLOTHING_QUERY
 
-    # Seed
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
     else:
@@ -312,12 +266,10 @@ def _infer_impl(
 
     generator = torch.Generator(device=DEVICE).manual_seed(seed)
 
-
-    width, height = _fit_to_multiple_of_64(width, height)
+    width, height = fit64(width, height)
     width = min(width, MAX_IMAGE_SIZE)
     height = min(height, MAX_IMAGE_SIZE)
 
-    # Detect clothing mask on original image
     mask = detect_clothing_mask(
         image=image,
         clothing_query=clothing_query,
@@ -328,19 +280,18 @@ def _infer_impl(
     )
 
     if mask is None:
-        return image, f"Seed: {seed}. No clothing
+        return image, f"Seed: {seed}. No clothing detected, try lowering thresholds or changing query."
 
-
-    img_resized = _resize_rgb(image, width, height)
-    mask_resized = _resize_mask(mask, width, height)
+    img_resized = resize_rgb(image, width, height)
+    mask_resized = resize_mask(mask, width, height)
 
     status = f"Seed: {seed}"
-    if
-        status += " |
+    if not CUDA_OK:
+        status += " | CPU only (slow)."
 
     try:
         with torch.inference_mode():
-            if
+            if CUDA_OK:
                 with torch.autocast("cuda", dtype=DTYPE):
                     out = pipe(
                         prompt=prompt,
@@ -362,73 +313,79 @@ def _infer_impl(
                         generator=generator,
                     )
 
-
-        return result, status
+        return out.images[0], status
 
     except Exception as e:
         return make_error_image(width, height), f"Error: {type(e).__name__}: {e}"
 
     finally:
         gc.collect()
-        if
+        if CUDA_OK:
             torch.cuda.empty_cache()
 
 
+# ============================================================
+# IMPORTANT: Always define a @spaces.GPU function if spaces imports
+# (Spaces startup checker requires it)
+# ============================================================
 if SPACES_AVAILABLE:
     @spaces.GPU
     def infer(*args, **kwargs):
-        return _infer_impl(*args, **kwargs)
+        return infer_core(*args, **kwargs)
 else:
     def infer(*args, **kwargs):
-        return _infer_impl(*args, **kwargs)
+        return infer_core(*args, **kwargs)
 
 
 # ============================================================
 # UI
 # ============================================================
-
-CSS = """
-body { background: #000; color: #fff; }
-"""
+CSS = "body { background: #000; color: #fff; }"
 
 with gr.Blocks(title="Auto Clothing Replacement") as demo:
     gr.HTML(f"<style>{CSS}</style>")
-
     gr.Markdown("## Automatic Clothing Replacement (no paint, no manual mask)")
-    gr.Markdown("Upload a photo, describe the new clothing.
+    gr.Markdown("Upload a photo, describe the new clothing. Detection and masking is automatic.")
 
     if not model_loaded:
         gr.Markdown(f"⚠️ Model failed to load:\n\n{load_error}")
 
-
-    image = gr.Image(type="pil", label="Input image")
+    image = gr.Image(type="pil", label="Input image")
 
-    prompt = gr.Textbox(
-
+    prompt = gr.Textbox(
+        label="Prompt (describe new clothing)",
+        lines=2,
+        placeholder="e.g., a navy business suit jacket, realistic fabric folds, studio lighting",
+    )
+    negative_prompt = gr.Textbox(
+        label="Negative prompt (optional)",
+        lines=2,
+        placeholder="e.g., blurry, deformed, low quality",
+    )
 
-
-
+    run = gr.Button("Replace Clothing")
+    out_img = gr.Image(label="Result")
     status = gr.Markdown("")
 
     with gr.Accordion("Advanced settings", open=False):
-        clothing_query = gr.Textbox(label="Detection query
+        clothing_query = gr.Textbox(label="Detection query", value=DEFAULT_CLOTHING_QUERY)
 
         seed = gr.Slider(0, MAX_SEED, step=1, value=0, label="Seed")
         randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
 
-        width = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=768 if
-        height = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=768 if
+        width = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=768 if not CUDA_OK else 1024, label="Width")
+        height = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=768 if not CUDA_OK else 1024, label="Height")
 
         guidance_scale = gr.Slider(0.0, 15.0, step=0.1, value=7.0, label="Guidance scale")
         num_inference_steps = gr.Slider(1, 80, step=1, value=30, label="Steps")
 
-        box_threshold = gr.Slider(0.05, 0.90, step=0.01, value=DEFAULT_BOX_THRESHOLD, label="Box threshold (
-        text_threshold = gr.Slider(0.05, 0.90, step=0.01, value=DEFAULT_TEXT_THRESHOLD, label="Text threshold (
+        box_threshold = gr.Slider(0.05, 0.90, step=0.01, value=DEFAULT_BOX_THRESHOLD, label="Box threshold (DINO)")
+        text_threshold = gr.Slider(0.05, 0.90, step=0.01, value=DEFAULT_TEXT_THRESHOLD, label="Text threshold (DINO)")
 
-        dilate_radius = gr.Slider(0, 30, step=1, value=8, label="Mask dilation radius
-        keep_largest = gr.Checkbox(value=True, label="Keep only largest
+        dilate_radius = gr.Slider(0, 30, step=1, value=8, label="Mask dilation radius")
+        keep_largest = gr.Checkbox(value=True, label="Keep only largest region")
 
-
+    run.click(
         fn=infer,
         inputs=[
            image,
@@ -446,8 +403,8 @@ with gr.Blocks(title="Auto Clothing Replacement") as demo:
             dilate_radius,
             keep_largest,
         ],
-        outputs=[
+        outputs=[out_img, status],
     )
 
 if __name__ == "__main__":
-    demo.queue().launch(ssr_mode=False)
+    demo.queue().launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
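The `cx, cy, bw, bh = map(float, b)` line relies on GroundingDINO returning boxes as [cx, cy, w, h] normalized to 0..1 (as the deleted comment noted), while SAM's box prompt expects XYXY pixel coordinates. A standalone sketch of that conversion in NumPy; the clamping at the end is an extra safety assumption, not shown in the diff:

import numpy as np

def cxcywh_norm_to_xyxy_px(box, img_w, img_h):
    # box: [cx, cy, w, h], each normalized to 0..1
    cx, cy, bw, bh = map(float, box)
    x1 = int((cx - bw / 2.0) * img_w)
    y1 = int((cy - bh / 2.0) * img_h)
    x2 = int((cx + bw / 2.0) * img_w)
    y2 = int((cy + bh / 2.0) * img_h)
    # clamp so SAM never sees out-of-image coordinates
    x1, y1 = max(0, x1), max(0, y1)
    x2, y2 = min(img_w, x2), min(img_h, y2)
    return np.array([x1, y1, x2, y2], dtype=np.float32)

print(cxcywh_norm_to_xyxy_px([0.5, 0.5, 0.2, 0.4], 1024, 768))
# -> [409. 230. 614. 537.]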
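For reference, the mask convention the pipeline depends on (stated in a docstring the diff removes) is: white = edit, black = keep. A minimal standalone sketch of the diffusers inpaint call used above, assuming a CUDA machine; the file names, prompt, and seed are illustrative:

import torch
from PIL import Image
from diffusers import StableDiffusionXLInpaintPipeline

pipe = StableDiffusionXLInpaintPipeline.from_pretrained(
    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1",
    torch_dtype=torch.float16,
    use_safetensors=True,
).to("cuda")

image = Image.open("person.jpg").convert("RGB").resize((1024, 1024))
mask = Image.open("clothing_mask.png").convert("L").resize((1024, 1024))  # white = repaint

out = pipe(
    prompt="a navy business suit jacket, realistic fabric folds",
    image=image,
    mask_image=mask,
    width=1024,
    height=1024,
    guidance_scale=7.0,
    num_inference_steps=30,
    generator=torch.Generator("cuda").manual_seed(0),
)
out.images[0].save("result.png")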