Spaces: Running on Zero
Update app.py
app.py CHANGED
--- app.py (old; "-" marks removed lines, "…" marks content the diff viewer did not render)
@@ -1,8 +1,6 @@
 # app.py
 # ============================================================
-#
-# GroundingDINO -> SAM -> SDXL Inpaint
-# Fixes: Spaces requires @spaces.GPU function at startup
 # ============================================================
 
 import os
@@ -10,401 +8,348 @@ import gc
 import random
 import warnings
 import logging
 
-import numpy as np
 import gradio as gr
 from PIL import Image
 
 import torch
-from huggingface_hub import login
-
-from diffusers import StableDiffusionXLInpaintPipeline
-from groundingdino.util.inference import load_model, predict
-from segment_anything import sam_model_registry, SamPredictor
 
 # ============================================================
-#
 # ============================================================
-try:
-    import spaces
-    SPACES_AVAILABLE = True
-except Exception:
-    spaces = None
-    SPACES_AVAILABLE = False
 
 # ============================================================
 # Config
 # ============================================================
-…
 
 HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
 if HF_TOKEN:
     login(token=HF_TOKEN)
 
-…
-DEVICE = "cuda" if CUDA_OK else "cpu"
 
-…
-    DTYPE = torch.float16
-else:
-    DTYPE = torch.float32
 
-…
-    "INPAINT_MODEL",
-    "diffusers/stable-diffusion-xl-1.0-inpainting-0.1"
-).strip()
 
 # ============================================================
-# Load
 # ============================================================
 model_loaded = False
 load_error = None
 
-…
-# ---- GroundingDINO ----
-DINO_REPO = "IDEA-Research/GroundingDINO"
-dino_cfg_path = hf_hub_download(
-    repo_id=DINO_REPO,
-    filename="groundingdino/config/GroundingDINO_SwinT_OGC.py",
-    token=HF_TOKEN if HF_TOKEN else None,
-)
-dino_ckpt_path = hf_hub_download(
-    repo_id=DINO_REPO,
-    filename="groundingdino_swint_ogc.pth",
-    token=HF_TOKEN if HF_TOKEN else None,
-)
-dino = load_model(dino_cfg_path, dino_ckpt_path)
-
-# ---- SAM ----
-SAM_REPO = "facebook/sam-vit-huge"
-sam_ckpt_path = hf_hub_download(
-    repo_id=SAM_REPO,
-    filename="sam_vit_h_4b8939.pth",
-    token=HF_TOKEN if HF_TOKEN else None,
-)
-sam = sam_model_registry["vit_h"](checkpoint=sam_ckpt_path)
-sam.to(DEVICE)
-sam_predictor = SamPredictor(sam)
-
-# ---- SDXL Inpaint ----
-fp_kwargs = {"torch_dtype": DTYPE, "use_safetensors": True}
-if HF_TOKEN:
-    fp_kwargs["token"] = HF_TOKEN
 
-…
 try:
-…
 except Exception:
     pass
 
-…
 
 # ============================================================
 # Helpers
 # ============================================================
 def make_error_image(w: int, h: int) -> Image.Image:
     return Image.new("RGB", (int(w), int(h)), (18, 18, 22))
 
-def fit64(w, h):
-    w = max(256, (w // 64) * 64)
-    h = max(256, (h // 64) * 64)
-    return w, h
-
-def resize_rgb(img: Image.Image, w: int, h: int) -> Image.Image:
-    return img.convert("RGB").resize((w, h), Image.LANCZOS)
-
-def resize_mask(mask: Image.Image, w: int, h: int) -> Image.Image:
-    return mask.convert("L").resize((w, h), Image.NEAREST)
-
-def dilate_mask(mask_np: np.ndarray, radius: int) -> np.ndarray:
-    if radius <= 0:
-        return mask_np
-    import cv2
-    kernel = np.ones((radius * 2 + 1, radius * 2 + 1), np.uint8)
-    return cv2.dilate(mask_np, kernel, iterations=1)
-
-def largest_component(mask_np: np.ndarray) -> np.ndarray:
-    import cv2
-    num_labels, labels, stats, _ = cv2.connectedComponentsWithStats(mask_np, connectivity=8)
-    if num_labels <= 1:
-        return mask_np
-    largest = 1 + np.argmax(stats[1:, cv2.CC_STAT_AREA])
-    out = np.zeros_like(mask_np)
-    out[labels == largest] = 255
-    return out
-
-def detect_clothing_mask(
-    image: Image.Image,
-    clothing_query: str,
-    box_threshold: float,
-    text_threshold: float,
-    dilate_radius: int,
-    keep_largest: bool,
-):
-    if image is None:
         return None
-
-    img_rgb = image.convert("RGB")
-    w, h = img_rgb.size
-    img_np = np.array(img_rgb)
-
-    boxes, _, _ = predict(
-        model=dino,
-        image=img_np,
-        caption=clothing_query,
-        box_threshold=float(box_threshold),
-        text_threshold=float(text_threshold),
-    )
-
-    if boxes is None or len(boxes) == 0:
         return None
-…
-    sam_predictor.set_image(img_np)
-
-    full_mask = np.zeros((h, w), dtype=np.uint8)
-    for box in boxes_px:
-        box_arr = np.array(box, dtype=np.float32)
-        masks, _, _ = sam_predictor.predict(box=box_arr, multimask_output=False)
-        m = masks[0].astype(np.uint8) * 255
-        full_mask = np.maximum(full_mask, m)
-
-    if keep_largest:
-        full_mask = largest_component(full_mask)
-
-    full_mask = dilate_mask(full_mask, int(dilate_radius))
-
-    return Image.fromarray(full_mask, mode="L")
 
 # ============================================================
-#
 # ============================================================
-…
     prompt,
     negative_prompt,
-    clothing_query,
     seed,
     randomize_seed,
     width,
     height,
     guidance_scale,
     num_inference_steps,
-…
 ):
     width = int(width)
     height = int(height)
 
     if not model_loaded:
         return make_error_image(width, height), f"Model load failed: {load_error}"
 
-    if image is None:
-        return make_error_image(width, height), "Error: upload an image."
-
     prompt = (prompt or "").strip()
     if not prompt:
         return make_error_image(width, height), "Error: prompt is empty."
 
-    neg = (negative_prompt or "").strip()
-    if not neg:
-        neg = None
-
-    clothing_query = (clothing_query or "").strip() or DEFAULT_CLOTHING_QUERY
-
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
-    else:
-        seed = int(seed)
-
-    generator = torch.Generator(device=DEVICE).manual_seed(seed)
-
-    width, height = fit64(width, height)
-    width = min(width, MAX_IMAGE_SIZE)
-    height = min(height, MAX_IMAGE_SIZE)
-
-    mask = detect_clothing_mask(
-        image=image,
-        clothing_query=clothing_query,
-        box_threshold=float(box_threshold),
-        text_threshold=float(text_threshold),
-        dilate_radius=int(dilate_radius),
-        keep_largest=bool(keep_largest),
-    )
-
-    if mask is None:
-        return image, f"Seed: {seed}. No clothing detected, try lowering thresholds or changing query."
 
-…
-    mask_resized = resize_mask(mask, width, height)
 
     status = f"Seed: {seed}"
-    if …
-        status += " | …
 
     try:
         with torch.inference_mode():
-            if …
-                with torch.autocast("cuda", dtype=…
-…
-                    generator=generator,
-                )
             else:
-…
-        return out.images[0], status
 
     except Exception as e:
         return make_error_image(width, height), f"Error: {type(e).__name__}: {e}"
 
     finally:
         gc.collect()
-        if …
             torch.cuda.empty_cache()
 
-# ============================================================
-# IMPORTANT: Always define a @spaces.GPU function if spaces imports
-# (Spaces startup checker requires it)
-# ============================================================
 if SPACES_AVAILABLE:
     @spaces.GPU
     def infer(*args, **kwargs):
-        return …
 else:
     def infer(*args, **kwargs):
-        return …
 
 # ============================================================
-# UI
 # ============================================================
-CSS = "body { background: #000; color: #fff; }"
 
-…
     gr.HTML(f"<style>{CSS}</style>")
-…
 
     if not model_loaded:
         gr.Markdown(f"⚠️ Model failed to load:\n\n{load_error}")
 
-…
-    prompt = gr.Textbox(
-        …
-        lines=2,
-        placeholder="e.g., a navy business suit jacket, realistic fabric folds, studio lighting",
-    )
-    negative_prompt = gr.Textbox(
-        label="Negative prompt (optional)",
-        lines=2,
-        placeholder="e.g., blurry, deformed, low quality",
-    )
 
-…
     status = gr.Markdown("")
 
-    with gr.Accordion("Advanced …
-…
         seed = gr.Slider(0, MAX_SEED, step=1, value=0, label="Seed")
         randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
 
-        width = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=…
-        height = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=…
-
-        guidance_scale = gr.Slider(0.0, 15.0, step=0.1, value=7.0, label="Guidance scale")
-        num_inference_steps = gr.Slider(1, 80, step=1, value=30, label="Steps")
 
-…
-        keep_largest = gr.Checkbox(value=True, label="Keep only largest region")
 
-…
         fn=infer,
         inputs=[
-            image,
             prompt,
             negative_prompt,
-            clothing_query,
             seed,
             randomize_seed,
             width,
             height,
             guidance_scale,
             num_inference_steps,
-…
         ],
-        outputs=[…
     )
 
 if __name__ == "__main__":
-    demo.queue().launch(…
+++ app.py (new; "+" marks added lines)
 # app.py
 # ============================================================
+# IMPORTANT: imports order matters for Hugging Face Spaces
 # ============================================================
 
 import os
 import gc
 import random
 import warnings
 import logging
+import inspect
+
+# ---- Spaces GPU decorator (must be imported early) ----------
+try:
+    import spaces  # noqa: F401
+    SPACES_AVAILABLE = True
+except Exception:
+    SPACES_AVAILABLE = False
 
 import gradio as gr
+import numpy as np
 from PIL import Image
 
 import torch
+from huggingface_hub import login
 
 # ============================================================
+# Try importing Z-Image pipelines (requires diffusers>=0.36.0)
 # ============================================================
 
+ZIMAGE_AVAILABLE = True
+ZIMAGE_IMPORT_ERROR = None
+
+try:
+    from diffusers import (
+        ZImagePipeline,
+        ZImageImg2ImgPipeline,
+        FlowMatchEulerDiscreteScheduler,
+    )
+except Exception as e:
+    ZIMAGE_AVAILABLE = False
+    ZIMAGE_IMPORT_ERROR = repr(e)
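Both import guards above fail soft so the Space still boots and can surface the error in the UI. A quick pre-deploy sanity check along these lines (a sketch, not part of app.py; assumes diffusers>=0.36.0 ships the Z-Image classes) confirms the installed build before pushing:

import diffusers

# Print the version and whether the classes the app needs are exposed.
print(diffusers.__version__)
for name in ("ZImagePipeline", "ZImageImg2ImgPipeline", "FlowMatchEulerDiscreteScheduler"):
    print(name, hasattr(diffusers, name))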
 
 # ============================================================
 # Config
 # ============================================================
+
+MODEL_PATH = os.environ.get("MODEL_PATH", "telcom/dee-z-image").strip()
+
+ATTENTION_BACKEND = os.environ.get("ATTENTION_BACKEND", "flash_3").strip()
+ENABLE_COMPILE = os.environ.get("ENABLE_COMPILE", "false").lower() == "true"
 
 HF_TOKEN = os.getenv("HF_TOKEN", "").strip()
 if HF_TOKEN:
     login(token=HF_TOKEN)
 
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
+warnings.filterwarnings("ignore")
+logging.getLogger("transformers").setLevel(logging.ERROR)
 
+MAX_SEED = np.iinfo(np.int32).max
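All runtime knobs are read from environment variables at import time, so overrides must be in place before the module loads. A hypothetical local override ("some-org/alt-z-image" is a placeholder repo id, not from this Space):

import os

# Must run before `import app`, since app.py reads these at module level.
os.environ["MODEL_PATH"] = "some-org/alt-z-image"  # placeholder repo id
os.environ["ENABLE_COMPILE"] = "true"

import app  # picks up the overrides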
 
+# ============================================================
+# Device & dtype
+# ============================================================
 
+cuda_available = torch.cuda.is_available()
+device = torch.device("cuda" if cuda_available else "cpu")
 
+if cuda_available and hasattr(torch.cuda, "is_bf16_supported") and torch.cuda.is_bf16_supported():
+    dtype = torch.bfloat16
+elif cuda_available:
+    dtype = torch.float16
+else:
+    dtype = torch.float32
 
+MAX_IMAGE_SIZE = 1536 if cuda_available else 768
 
+fallback_msg = ""
+if not cuda_available:
+    fallback_msg = "GPU unavailable. Running in CPU fallback mode (slow)."
 
 # ============================================================
+# Load pipelines
 # ============================================================
+
+pipe_txt2img = None
+pipe_img2img = None
 model_loaded = False
 load_error = None
 
+def _set_attention_backend_best_effort(p):
+    try:
+        if hasattr(p, "transformer") and hasattr(p.transformer, "set_attention_backend"):
+            p.transformer.set_attention_backend(ATTENTION_BACKEND)
+    except Exception:
+        pass
 
+def _compile_best_effort(p):
+    if not (ENABLE_COMPILE and device.type == "cuda"):
+        return
     try:
+        if hasattr(p, "transformer"):
+            p.transformer = torch.compile(
+                p.transformer,
+                mode="max-autotune-no-cudagraphs",
+                fullgraph=False,
+            )
     except Exception:
         pass
 
+if ZIMAGE_AVAILABLE:
+    try:
+        fp_kwargs = {
+            "torch_dtype": dtype,
+            "use_safetensors": True,
+        }
+        if HF_TOKEN:
+            fp_kwargs["token"] = HF_TOKEN
+
+        pipe_txt2img = ZImagePipeline.from_pretrained(MODEL_PATH, **fp_kwargs).to(device)
+        _set_attention_backend_best_effort(pipe_txt2img)
+        _compile_best_effort(pipe_txt2img)
+
+        try:
+            pipe_txt2img.set_progress_bar_config(disable=True)
+        except Exception:
+            pass
+
+        # Share weights/components with img2img pipeline
+        pipe_img2img = ZImageImg2ImgPipeline(**pipe_txt2img.components).to(device)
+        _set_attention_backend_best_effort(pipe_img2img)
+        try:
+            pipe_img2img.set_progress_bar_config(disable=True)
+        except Exception:
+            pass
+
+        model_loaded = True
 
+    except Exception as e:
+        load_error = repr(e)
+        model_loaded = False
+else:
+    load_error = (
+        "Z-Image pipelines not available in your diffusers install.\n\n"
+        f"Import error:\n{ZIMAGE_IMPORT_ERROR}\n\n"
+        "Fix: set requirements.txt to diffusers==0.36.0 (or install Diffusers from source)."
+    )
+    model_loaded = False
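Constructing the img2img pipeline from pipe_txt2img.components reuses the already-loaded modules instead of loading the checkpoint twice. A sketch of what that implies, assuming the Z-Image pipelines follow the usual DiffusionPipeline components contract:

# Every shared component should be the very same object, so the weights
# occupy GPU memory only once.
if model_loaded:
    shared = pipe_txt2img.components.keys() & pipe_img2img.components.keys()
    for name in shared:
        assert pipe_img2img.components[name] is pipe_txt2img.components[name]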
 
 # ============================================================
 # Helpers
 # ============================================================
+
 def make_error_image(w: int, h: int) -> Image.Image:
     return Image.new("RGB", (int(w), int(h)), (18, 18, 22))
 
+def prep_init_image(img: Image.Image, width: int, height: int) -> Image.Image:
+    if img is None:
         return None
+    if not isinstance(img, Image.Image):
         return None
+    img = img.convert("RGB")
+    if img.size != (width, height):
+        img = img.resize((width, height), Image.LANCZOS)
+    return img
+
+def _call_pipeline(pipe, kwargs: dict):
+    """
+    Robust call: only pass kwargs the pipeline actually accepts.
+    This avoids crashes if a particular build does not support negative_prompt, etc.
+    """
+    try:
+        sig = inspect.signature(pipe.__call__)
+        allowed = set(sig.parameters.keys())
+        filtered = {k: v for k, v in kwargs.items() if k in allowed and v is not None}
+        return pipe(**filtered)
+    except Exception:
+        # Fallback: try raw kwargs (some pipelines use **kwargs internally)
+        return pipe(**{k: v for k, v in kwargs.items() if v is not None})
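_call_pipeline silently drops any kwarg the target pipeline does not declare. A standalone illustration with a dummy callable (hypothetical, for clarity only):

class FakePipe:
    def __call__(self, prompt, guidance_scale=0.0):
        return prompt, guidance_scale

# "negative_prompt" is not in FakePipe's signature, so it is filtered out
# instead of raising TypeError.
print(_call_pipeline(FakePipe(), {"prompt": "a cat", "negative_prompt": "dog"}))
# -> ('a cat', 0.0)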
 
 # ============================================================
+# Inference
 # ============================================================
 
+def _infer_impl(
     prompt,
     negative_prompt,
     seed,
     randomize_seed,
     width,
     height,
     guidance_scale,
     num_inference_steps,
+    shift,
+    max_sequence_length,
+    init_image,
+    strength,
 ):
     width = int(width)
     height = int(height)
+    seed = int(seed)
 
     if not model_loaded:
         return make_error_image(width, height), f"Model load failed: {load_error}"
 
     prompt = (prompt or "").strip()
     if not prompt:
         return make_error_image(width, height), "Error: prompt is empty."
 
     if randomize_seed:
         seed = random.randint(0, MAX_SEED)
 
+    generator = torch.Generator(device=device).manual_seed(seed)
 
     status = f"Seed: {seed}"
+    if fallback_msg:
+        status += f" | {fallback_msg}"
 
+    gs = float(guidance_scale)
+    steps = int(num_inference_steps)
+    msl = int(max_sequence_length)
+    st = float(strength)
+
+    neg = (negative_prompt or "").strip()
+    if not neg:
+        neg = None
+
+    init_image = prep_init_image(init_image, width, height)
+
+    # Update scheduler (shift) per run
     try:
+        scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=1000, shift=float(shift))
+        pipe_txt2img.scheduler = scheduler
+        pipe_img2img.scheduler = scheduler
+    except Exception:
+        pass
+
+    try:
+        base_kwargs = dict(
+            prompt=prompt,
+            height=height,
+            width=width,
+            guidance_scale=gs,
+            num_inference_steps=steps,
+            generator=generator,
+            max_sequence_length=msl,
+        )
+        # only passed if supported by the pipeline
+        if neg is not None:
+            base_kwargs["negative_prompt"] = neg
+
         with torch.inference_mode():
+            if device.type == "cuda":
+                with torch.autocast("cuda", dtype=dtype):
+                    if init_image is not None:
+                        out = _call_pipeline(
+                            pipe_img2img,
+                            {**base_kwargs, "image": init_image, "strength": st},
+                        )
+                    else:
+                        out = _call_pipeline(pipe_txt2img, base_kwargs)
             else:
+                if init_image is not None:
+                    out = _call_pipeline(
+                        pipe_img2img,
+                        {**base_kwargs, "image": init_image, "strength": st},
+                    )
+                else:
+                    out = _call_pipeline(pipe_txt2img, base_kwargs)
+
+        img = out.images[0]
+        return img, status
 
     except Exception as e:
         return make_error_image(width, height), f"Error: {type(e).__name__}: {e}"
 
     finally:
         gc.collect()
+        if device.type == "cuda":
             torch.cuda.empty_cache()
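An aside on the per-run scheduler swap inside _infer_impl: constructing FlowMatchEulerDiscreteScheduler from scratch discards any model-specific scheduler settings. A hedged alternative sketch that preserves them and overrides only shift (a possible drop-in for the try block above, not what this commit does):

# Rebuild from the pipeline's own scheduler config, overriding only `shift`.
pipe_txt2img.scheduler = FlowMatchEulerDiscreteScheduler.from_config(
    pipe_txt2img.scheduler.config, shift=float(shift)
)
pipe_img2img.scheduler = pipe_txt2img.scheduler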
 
 if SPACES_AVAILABLE:
     @spaces.GPU
     def infer(*args, **kwargs):
+        return _infer_impl(*args, **kwargs)
 else:
     def infer(*args, **kwargs):
+        return _infer_impl(*args, **kwargs)
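On ZeroGPU, spaces.GPU also accepts a duration hint for calls expected to outlive the default slot; a variant sketch (the 120-second value and the infer_long name are illustrative, not from this Space):

if SPACES_AVAILABLE:
    @spaces.GPU(duration=120)  # request a longer ZeroGPU slot, in seconds
    def infer_long(*args, **kwargs):
        return _infer_impl(*args, **kwargs)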
 
 # ============================================================
+# UI (simple black style like your SDXL example)
 # ============================================================
 
+CSS = """
+body {
+    background: #000;
+    color: #fff;
+}
+"""
+
+with gr.Blocks(title="Z-Image txt2img + img2img") as demo:
     gr.HTML(f"<style>{CSS}</style>")
+
+    if fallback_msg:
+        gr.Markdown(f"**{fallback_msg}**")
 
     if not model_loaded:
         gr.Markdown(f"⚠️ Model failed to load:\n\n{load_error}")
 
+    gr.Markdown("## Z-Image Generator (txt2img + img2img)")
 
+    prompt = gr.Textbox(label="Prompt", lines=2)
+    init_image = gr.Image(label="Initial image (optional)", type="pil")
 
+    run_button = gr.Button("Generate")
+    result = gr.Image(label="Result")
     status = gr.Markdown("")
 
+    with gr.Accordion("Advanced Settings", open=False):
+        negative_prompt = gr.Textbox(label="Negative prompt (optional)")
         seed = gr.Slider(0, MAX_SEED, step=1, value=0, label="Seed")
         randomize_seed = gr.Checkbox(value=True, label="Randomize seed")
 
+        width = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=1024, label="Width")
+        height = gr.Slider(256, MAX_IMAGE_SIZE, step=64, value=1024, label="Height")
 
+        guidance_scale = gr.Slider(0.0, 10.0, step=0.1, value=0.0, label="Guidance scale")
+        num_inference_steps = gr.Slider(1, 100, step=1, value=8, label="Steps")
+        shift = gr.Slider(1.0, 10.0, step=0.1, value=3.0, label="Time shift")
+        max_sequence_length = gr.Slider(64, 512, step=64, value=512, label="Max sequence length")
 
+        strength = gr.Slider(0.0, 1.0, step=0.05, value=0.6, label="Image strength (img2img)")
 
+    run_button.click(
         fn=infer,
         inputs=[
             prompt,
             negative_prompt,
             seed,
             randomize_seed,
             width,
             height,
             guidance_scale,
             num_inference_steps,
+            shift,
+            max_sequence_length,
+            init_image,
+            strength,
         ],
+        outputs=[result, status],
     )
 
 if __name__ == "__main__":
+    demo.queue().launch(ssr_mode=False)
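For a quick smoke test without the UI, _infer_impl can be called directly once the module has loaded its pipelines (argument values below are illustrative):

img, msg = _infer_impl(
    prompt="a red bicycle on a beach",
    negative_prompt="",
    seed=0,
    randomize_seed=False,
    width=1024,
    height=1024,
    guidance_scale=0.0,
    num_inference_steps=8,
    shift=3.0,
    max_sequence_length=512,
    init_image=None,
    strength=0.6,
)
print(msg)  # "Seed: 0" on success, or an error message
img.save("smoke_test.png")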