Spaces:

buxiangzhiren
/

GeoRemover

Running on Zero

App Files Files Community

zixinz committed on Oct 4

Commit

134053b

1 Parent(s): 5f25c59

chore: ignore pyc and __pycache__

Browse files

Files changed (1) hide show

app.py +257 -75

app.py CHANGED Viewed

@@ -1,4 +1,3 @@
 import gradio as gr
 import spaces
 import sys, pathlib
@@ -14,6 +13,12 @@ from diffusers.pipelines.flux.pipeline_flux_fill_unmasked_image_condition_versio
     FluxFillPipeline_token12_depth_only as FluxFillPipeline,
 )
 import os
 import sys
 import pathlib
@@ -21,7 +26,6 @@ import subprocess
 import random
 from typing import Optional, Tuple
 import torch
 from PIL import Image, ImageOps
 import numpy as np
@@ -63,7 +67,7 @@ def _ensure_executable(p: pathlib.Path):
 def ensure_assets_if_missing():
     if os.getenv("SKIP_ASSET_DOWNLOAD") == "1":
-        print("↪️  SKIP_ASSET_DOWNLOAD=1 -> 跳过资产下载检查")
         return
     if _have_all_assets():
         print("✅ Assets already present")
@@ -89,6 +93,9 @@ except Exception as e:
 # ---------------- Global singletons ----------------
 _MODELS: dict[str, DepthModel] = {}
 _PIPE: Optional[FluxFillPipeline] = None
 def get_model(encoder: str) -> DepthModel:
     if encoder not in _MODELS:
@@ -103,14 +110,11 @@ def get_pipe() -> FluxFillPipeline:
     device = "cuda" if torch.cuda.is_available() else "cpu"
     dtype = torch.bfloat16 if device == "cuda" else torch.float32
-    local_flux = BASE_DIR / "code_edit" / "flux_cache"
     use_local = local_flux.exists()
     hf_token = os.environ.get("HF_TOKEN")
     try:
         from huggingface_hub import hf_hub_enable_hf_transfer
         hf_hub_enable_hf_transfer()
@@ -124,12 +128,11 @@ def get_pipe() -> FluxFillPipeline:
                 local_flux, torch_dtype=dtype
             ).to(device)
         else:
-            # 在线拉取（需要 gated 访问 + token）
             pipe = FluxFillPipeline.from_pretrained(
                 "black-forest-labs/FLUX.1-Fill-dev",
                 torch_dtype=dtype,
-                token=hf_token,
-                # use_auth_token=hf_token,
             ).to(device)
     except Exception as e:
         raise RuntimeError(
@@ -140,25 +143,25 @@ def get_pipe() -> FluxFillPipeline:
     # -------- LoRA (stage1) --------
     lora_dir = CODE_EDIT / "stage1" / "checkpoint-4800"
-    lora_file = "pytorch_lora_weights.safetensors"   # 你的实际文件名
     adapter_name = "stage1"
     if lora_dir.exists():
         try:
-            import peft  # just to assert backend is present
             print(f"[pipe] loading LoRA from: {lora_dir}/{lora_file}")
             pipe.load_lora_weights(
                 str(lora_dir),
-                weight_name=lora_file,          # 关键：指定文件名
-                adapter_name=adapter_name       # 给一个可切换的名字
             )
-            # 新版 diffusers：优先 set_adapters
             try:
                 pipe.set_adapters(adapter_name, scale=1.0)
                 print(f"[pipe] set_adapters('{adapter_name}', scale=1.0)")
             except Exception as e_set:
                 print(f"[pipe] set_adapters not available ({e_set}); trying fuse_lora()")
-                # 旧版/或不支持 set_adapters 的 pipeline：融合 LoRA
                 try:
                     pipe.fuse_lora(lora_scale=1.0)
                     print("[pipe] fuse_lora(lora_scale=1.0) done")
@@ -175,39 +178,126 @@ def get_pipe() -> FluxFillPipeline:
     _PIPE = pipe
     return pipe
 # ---------------- Mask helpers ----------------
 def to_grayscale_mask(im: Image.Image) -> Image.Image:
     """
-    将任意 RGBA/RGB/L 的图转为 L。
-    输出：白=需要移除/填充区域，黑=保留。
     """
     if im.mode == "RGBA":
         mask = im.split()[-1]     # alpha as mask
     else:
         mask = im.convert("L")
-    # 简单二值化，去噪
     mask = mask.point(lambda p: 255 if p > 16 else 0)
-    return mask  # 不做 invert，白色=mask区域
 def dilate_mask(mask_l: Image.Image, px: int) -> Image.Image:
-    """对白色区域做膨胀；px 约等于扩大像素。"""
     if px <= 0:
         return mask_l
     arr = np.array(mask_l, dtype=np.uint8)
     kernel = np.ones((3, 3), np.uint8)
-    iters = max(1, int(px // 2))  # 经验
     dilated = cv2.dilate(arr, kernel, iterations=iters)
     return Image.fromarray(dilated, mode="L")
 def _mask_from_red(img: Image.Image, out_size: Tuple[int, int]) -> Image.Image:
     """
-    从一张 RGBA/RGB 图里提取“纯红笔迹”为二值蒙版（白=画笔，黑=其他）。
-    阈值稍微宽一点以容忍压缩/插值。
     """
     arr = np.array(img.convert("RGBA"))
     r, g, b, a = arr[..., 0], arr[..., 1], arr[..., 2], arr[..., 3]
-    # 条件：红高、绿低、蓝低、且 alpha>0
     red_hit = (r >= 200) & (g <= 40) & (b <= 40) & (a > 0)
     mask = (red_hit.astype(np.uint8) * 255)
@@ -221,27 +311,27 @@ def pick_mask(
     dilate_px: int = 0,
 ) -> Optional[Image.Image]:
     """
-    规则：
-      1) 若用户上传了 mask：直接用（白=mask）
-      2) 否则从 ImageEditor 返回里只“认红色笔迹”为 mask：
-         - 先看 sketch_data['mask']（有些版本会给）
-         - 不然遍历 sketch_data['layers'][*]['image']，合并其中的红色笔迹
-         - 若还没有，再退到 sketch_data['composite'] 里找红色笔迹
     """
-    # 1) 上传优先
     if isinstance(upload_mask, Image.Image):
         m = to_grayscale_mask(upload_mask).resize(base_image.size, Image.NEAREST)
         return dilate_mask(m, dilate_px) if dilate_px > 0 else m
-    # 2) 手绘（ImageEditor）
     if isinstance(sketch_data, dict):
-        # 2a) 显式 mask（仍然支持）
         m = sketch_data.get("mask")
         if isinstance(m, Image.Image):
             m = to_grayscale_mask(m).resize(base_image.size, Image.NEAREST)
             return dilate_mask(m, dilate_px) if dilate_px > 0 else m
-        # 2b) 从 layers 里合并红色笔迹
         layers = sketch_data.get("layers")
         acc = None
         if isinstance(layers, list) and layers:
@@ -252,28 +342,28 @@ def pick_mask(
                 li = lyr.get("image") or lyr.get("mask")
                 if isinstance(li, Image.Image):
                     m_layer = _mask_from_red(li, base_image.size)
-                    # 合并：有任一层画过就算 mask
                     acc = ImageOps.lighter(acc, m_layer)
             if acc.getbbox() is not None:
                 return dilate_mask(acc, dilate_px) if dilate_px > 0 else acc
-        # 2c) 最后从 composite 里找红色笔迹
         comp = sketch_data.get("composite")
         if isinstance(comp, Image.Image):
             m_comp = _mask_from_red(comp, base_image.size)
             if m_comp.getbbox() is not None:
                 return dilate_mask(m_comp, dilate_px) if dilate_px > 0 else m_comp
-    # 3) 没拿到就返回 None（后面会提示“需要掩码”）
     return None
 def _round_mult64(x: float, mode: str = "nearest") -> int:
     """
-    把 x 对齐到 64 的倍数：
-      - mode="ceil"    向上取整
-      - mode="floor"   向下取整
-      - mode="nearest" 最近的倍数
     """
     if mode == "ceil":
         return int((x + 63) // 64) * 64
@@ -284,20 +374,20 @@ def _round_mult64(x: float, mode: str = "nearest") -> int:
 def prepare_size_for_flux(img: Image.Image, target_max: int = 1024) -> tuple[int, int]:
     """
-    步骤：
-    1) 先把原始 w,h 向上对齐到 64 的倍数（避免小图过小）
-    2) 把长边固定为 target_max(默认1024)
-    3) 短边按比例缩放并对齐到 64 的倍数（至少 64）
     """
     w, h = img.size
-    # 1) 先各自向上对齐到 64 的倍数
     w1 = max(64, _round_mult64(w, mode="ceil"))
     h1 = max(64, _round_mult64(h, mode="ceil"))
-    # 2) 固定长边为 target_max，短边按比例
     if w1 >= h1:
-        out_w = target_max  # 长边固定 1024
         scaled_h = h1 * (target_max / w1)
         out_h = max(64, _round_mult64(scaled_h, mode="nearest"))
     else:
@@ -306,21 +396,22 @@ def prepare_size_for_flux(img: Image.Image, target_max: int = 1024) -> tuple[int
         out_w = max(64, _round_mult64(scaled_w, mode="nearest"))
     return int(out_w), int(out_h)
 @spaces.GPU
-# ---------------- Preview depth for canvas (彩色) ----------------
 def preview_depth(image: Optional[Image.Image], encoder: str, max_res: int, input_size: int, fp32: bool):
     if image is None:
         return None
     dm = get_model(encoder)
-    # 彩色可视化（RGB），严格按你之前的 colormap 风格
     d_rgb = dm.infer(image=image, max_res=max_res, input_size=input_size, fp32=fp32, grayscale=False)
     return d_rgb
 def prepare_canvas(image, depth_img, source):
     base = depth_img if source == "depth" else image
     if base is None:
-        raise gr.Error("请先上传图片（并等待深度预览出来），再点击\"Prepare canvas\"。")
-    # 对 ImageEditor 用通用的 gr.update 来设置 value
     return gr.update(value=base)
 # ---------------- Two-stage pipeline: depth(color) -> fill ----------------
@@ -341,9 +432,9 @@ def run_depth_and_fill(
     seed: Optional[int],
 ) -> Tuple[Image.Image, Image.Image]:
     if image is None:
-        raise gr.Error("请先上传一张图片。")
-    # 1) 生成彩色深度图（RGB）
     depth_model = get_model(encoder)
     depth_rgb: Image.Image = depth_model.infer(
         image=image, max_res=max_res, input_size=input_size, fp32=fp32, grayscale=False
@@ -351,26 +442,26 @@ def run_depth_and_fill(
     print(f"[DEBUG] Depth RGB: mode={depth_rgb.mode}, size={depth_rgb.size}")
-    # 2) 提取 mask（上传 > 手绘）
     mask_l = pick_mask(mask_upload, sketch, image, dilate_px=mask_dilate_px)
     if (mask_l is None) or (mask_l.getbbox() is None):
-        raise gr.Error("没有检测到有效的 mask：请确认已在画布上涂抹或上传 mask 图片。")
     print(f"[DEBUG] Mask: mode={mask_l.mode}, size={mask_l.size}, bbox={mask_l.getbbox()}")
-    # 3) 确定输出尺寸
     width, height = prepare_size_for_flux(depth_rgb, target_max=max_side)
     orig_w, orig_h = image.size
     print(f"[DEBUG] FLUX size: {width}x{height}, original: {orig_w}x{orig_h}")
-    # 4) 运行 FLUX pipeline
-    # 关键修复：image 参数应该传入 depth_rgb 而不是原图
     pipe = get_pipe()
     generator = torch.Generator("cpu").manual_seed(int(seed)) if (seed is not None and seed >= 0) else torch.Generator("cpu").manual_seed(random.randint(0, 2**31 - 1))
     result = pipe(
         prompt=prompt,
-        image=depth_rgb,           # 修复：传入彩色深度图而不是原图
         mask_image=mask_l,
         width=width,
         height=height,
@@ -378,26 +469,101 @@ def run_depth_and_fill(
         num_inference_steps=int(steps),
         max_sequence_length=512,
         generator=generator,
-        depth=depth_rgb,           # depth 参数也传入彩色深度图
     ).images[0]
     final_result = result.resize((orig_w, orig_h), Image.BICUBIC)
-    # 返回结果和 mask 预览
     mask_preview = mask_l.resize((orig_w, orig_h), Image.NEAREST).convert("RGB")
     return final_result, mask_preview
 # ---------------- UI ----------------
 with gr.Blocks() as demo:
-    gr.Markdown("## GeoRemover · Depth Removal (Depth(color) → FLUX Fill)")
     with gr.Row():
         with gr.Column(scale=1):
-            # 输入图
             img = gr.Image(label="Upload image", type="pil")
-            # Mask 两种方式：上传 or 画
             with gr.Tab("Upload mask"):
                 mask_upload = gr.Image(label="Mask (optional)", type="pil")
@@ -407,15 +573,14 @@ with gr.Blocks() as demo:
                 sketch = gr.ImageEditor(
                     label="Sketch mask (draw with brush)",
                     type="pil",
-                    # 画笔只给纯红，方便我们精确提取笔迹
                     brush=gr.Brush(colors=["#FF0000"], default_size=24)
                 )
             # prompt
             prompt = gr.Textbox(label="Prompt", value="A beautiful scene")
-            # 可调参数
             with gr.Accordion("Advanced (Depth & FLUX)", open=False):
                 encoder = gr.Dropdown(["vits", "vitl"], value="vitl", label="Depth encoder")
                 max_res = gr.Slider(512, 2048, value=1280, step=64, label="Depth: max_res")
@@ -425,30 +590,36 @@ with gr.Blocks() as demo:
                 mask_dilate_px = gr.Slider(0, 128, value=0, step=1, label="Mask dilation (px)")
                 guidance_scale = gr.Slider(0, 50, value=30, step=0.5, label="FLUX: guidance_scale")
                 steps = gr.Slider(10, 75, value=50, step=1, label="FLUX: steps")
-                seed = gr.Number(value=0, precision=0, label="Seed (>=0 固定；留空随机)")
             run_btn = gr.Button("Run", variant="primary")
         with gr.Column(scale=1):
             depth_preview = gr.Image(label="Depth preview (colored)", interactive=False)
-            mask_preview = gr.Image(label="Mask preview (what will be removed)", interactive=False)
             out = gr.Image(label="Output")
-    # 事件：上传图片后生成"彩色深度预览"
     img.change(
         fn=preview_depth,
         inputs=[img, encoder, max_res, input_size, fp32],
         outputs=[depth_preview],
     )
-    # 准备画布：把原图或"彩色深度图"放进 ImageEditor
     prepare_btn.click(
         fn=prepare_canvas,
         inputs=[img, depth_preview, draw_source],
         outputs=[sketch],
     )
-    # 运行
     run_btn.click(
         fn=run_depth_and_fill,
         inputs=[img, mask_upload, sketch, prompt, encoder, max_res, input_size, fp32,
@@ -457,6 +628,17 @@ with gr.Blocks() as demo:
         api_name="run",
     )
 if __name__ == "__main__":
     os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 import gradio as gr
 import spaces
 import sys, pathlib
     FluxFillPipeline_token12_depth_only as FluxFillPipeline,
 )
+# ==== STAGE-2 ONLY ADDED: import Stage-2 Pipeline (do not touch Stage-1) ====
+from diffusers.pipelines.flux.pipeline_flux_fill_unmasked_image_condition_version import (
+    FluxFillPipeline_token12_depth as FluxFillPipelineStage2,
+)
+# ===========================================================================
 import os
 import sys
 import pathlib
 import random
 from typing import Optional, Tuple
 import torch
 from PIL import Image, ImageOps
 import numpy as np
 def ensure_assets_if_missing():
     if os.getenv("SKIP_ASSET_DOWNLOAD") == "1":
+        print("↪️  SKIP_ASSET_DOWNLOAD=1 -> skip asset download check")
         return
     if _have_all_assets():
         print("✅ Assets already present")
 # ---------------- Global singletons ----------------
 _MODELS: dict[str, DepthModel] = {}
 _PIPE: Optional[FluxFillPipeline] = None
+# ==== STAGE-2 ONLY ADDED: singleton ====
+_PIPE_STAGE2: Optional[FluxFillPipelineStage2] = None
+# ======================================
 def get_model(encoder: str) -> DepthModel:
     if encoder not in _MODELS:
     device = "cuda" if torch.cuda.is_available() else "cpu"
     dtype = torch.bfloat16 if device == "cuda" else torch.float32
+    local_flux = BASE_DIR / "code_edit" / "flux_cache"
     use_local = local_flux.exists()
     hf_token = os.environ.get("HF_TOKEN")
     try:
         from huggingface_hub import hf_hub_enable_hf_transfer
         hf_hub_enable_hf_transfer()
                 local_flux, torch_dtype=dtype
             ).to(device)
         else:
+            # Fetch online (requires gated access + token)
             pipe = FluxFillPipeline.from_pretrained(
                 "black-forest-labs/FLUX.1-Fill-dev",
                 torch_dtype=dtype,
+                token=hf_token,
             ).to(device)
     except Exception as e:
         raise RuntimeError(
     # -------- LoRA (stage1) --------
     lora_dir = CODE_EDIT / "stage1" / "checkpoint-4800"
+    lora_file = "pytorch_lora_weights.safetensors"   # your actual file name
     adapter_name = "stage1"
     if lora_dir.exists():
         try:
+            import peft  # assert backend is present
             print(f"[pipe] loading LoRA from: {lora_dir}/{lora_file}")
             pipe.load_lora_weights(
                 str(lora_dir),
+                weight_name=lora_file,          # important: specify filename
+                adapter_name=adapter_name       # a switchable name
             )
+            # Newer diffusers prefer set_adapters
             try:
                 pipe.set_adapters(adapter_name, scale=1.0)
                 print(f"[pipe] set_adapters('{adapter_name}', scale=1.0)")
             except Exception as e_set:
                 print(f"[pipe] set_adapters not available ({e_set}); trying fuse_lora()")
+                # Older / pipelines without set_adapters: fuse LoRA
                 try:
                     pipe.fuse_lora(lora_scale=1.0)
                     print("[pipe] fuse_lora(lora_scale=1.0) done")
     _PIPE = pipe
     return pipe
+# ==== STAGE-2 ONLY ADDED: Stage-2 loader (no change to Stage-1 logic) ====
+def get_pipe_stage2() -> FluxFillPipelineStage2:
+    """
+    Return the process-wide Stage-2 pipeline, building it on first use.
+
+    The pipeline is FLUX.1-Fill-dev loaded through ``FluxFillPipelineStage2``
+    (from the local ``code_edit/flux_cache`` directory when it exists,
+    otherwise from the Hub using ``HF_TOKEN``) with the Stage-2 LoRA from
+    ``CODE_EDIT / "stage2" / "checkpoint-20000"`` activated. The result is
+    cached in the module-level ``_PIPE_STAGE2``.
+
+    Raises:
+        RuntimeError: if the LoRA directory or weight file is missing, if
+            ``peft`` cannot be imported, if the base model fails to load, or
+            if the LoRA cannot be loaded / activated.
+    """
+    global _PIPE_STAGE2
+    if _PIPE_STAGE2 is not None:
+        return _PIPE_STAGE2
+    # Check the cheap preconditions first, so a missing LoRA or a missing
+    # peft install fails immediately instead of after loading the base model.
+    lora_dir2 = CODE_EDIT / "stage2" / "checkpoint-20000"
+    candidate_names = [
+        "pytorch_lora_weights.safetensors",
+        "adapter_model.safetensors",
+        "lora.safetensors",
+    ]
+    if not lora_dir2.exists():
+        raise RuntimeError(f"Stage-2 LoRA dir not found: {lora_dir2}")
+    # First candidate file name that is actually present in the directory.
+    weight_name = next(
+        (name for name in candidate_names if (lora_dir2 / name).is_file()),
+        None,
+    )
+    if weight_name is None:
+        raise RuntimeError(
+            f"Stage-2 LoRA weight not found under {lora_dir2}. "
+            f"Tried: {candidate_names}"
+        )
+    try:
+        import peft  # noqa: F401
+    except Exception as e:
+        raise RuntimeError(
+            "peft is not installed (requires peft>=0.11 to load LoRA)."
+        ) from e
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.bfloat16 if device == "cuda" else torch.float32
+    local_flux = BASE_DIR / "code_edit" / "flux_cache"
+    use_local = local_flux.exists()
+    hf_token = os.environ.get("HF_TOKEN")
+    # Best-effort opt-in to accelerated downloads; never fatal.
+    try:
+        from huggingface_hub import hf_hub_enable_hf_transfer
+        hf_hub_enable_hf_transfer()
+    except Exception as e_transfer:
+        print(f"[stage2] hf_transfer not enabled ({e_transfer}); using default downloader")
+    print(f"[stage2] loading FLUX.1-Fill-dev (dtype={dtype}, device={device}, local={use_local})")
+    try:
+        if use_local:
+            pipe2 = FluxFillPipelineStage2.from_pretrained(local_flux, torch_dtype=dtype).to(device)
+        else:
+            pipe2 = FluxFillPipelineStage2.from_pretrained(
+                "black-forest-labs/FLUX.1-Fill-dev",
+                torch_dtype=dtype,
+                token=hf_token,
+            ).to(device)
+    except Exception as e:
+        raise RuntimeError("Stage-2: Failed to load FLUX.1-Fill-dev.") from e
+    # Load the Stage-2 LoRA weights.
+    print(f"[stage2] loading LoRA: {lora_dir2}/{weight_name}")
+    try:
+        pipe2.load_lora_weights(
+            str(lora_dir2),
+            weight_name=weight_name,
+            adapter_name="stage2",
+        )
+    except Exception as e:
+        raise RuntimeError(f"Stage-2 LoRA load failed: {e}") from e
+    # Activate the adapter; fall back to fusing it into the base weights when
+    # set_adapters() is unavailable or rejects the call.
+    # NOTE(review): upstream diffusers names this keyword `adapter_weights`;
+    # if the bundled fork rejects `scale=`, the fuse_lora() fallback below is
+    # what actually runs -- confirm against the installed diffusers version.
+    try:
+        pipe2.set_adapters("stage2", scale=1.0)
+        print("[stage2] set_adapters('stage2', 1.0)")
+    except Exception as e_set:
+        print(f"[stage2] set_adapters not available ({e_set}); trying fuse_lora()")
+        try:
+            pipe2.fuse_lora(lora_scale=1.0)
+            print("[stage2] fuse_lora(lora_scale=1.0) done")
+        except Exception as e_fuse:
+            raise RuntimeError(f"Stage-2 fuse_lora failed: {e_fuse}") from e_fuse
+    _PIPE_STAGE2 = pipe2
+    return pipe2
+# ==========================================================================
 # ---------------- Mask helpers ----------------
 def to_grayscale_mask(im: Image.Image) -> Image.Image:
     """
+    Convert any RGBA/RGB/L image to L mode.
+    Output: white = region to remove/fill, black = keep.
     """
     if im.mode == "RGBA":
         mask = im.split()[-1]     # alpha as mask
     else:
         mask = im.convert("L")
+    # simple binarization & denoise
     mask = mask.point(lambda p: 255 if p > 16 else 0)
+    return mask  # do not invert; white = mask region
 def dilate_mask(mask_l: Image.Image, px: int) -> Image.Image:
+    """Dilate white region by ~px pixels."""
     if px <= 0:
         return mask_l
     arr = np.array(mask_l, dtype=np.uint8)
     kernel = np.ones((3, 3), np.uint8)
+    iters = max(1, int(px // 2))  # heuristic
     dilated = cv2.dilate(arr, kernel, iterations=iters)
     return Image.fromarray(dilated, mode="L")
 def _mask_from_red(img: Image.Image, out_size: Tuple[int, int]) -> Image.Image:
     """
+    Extract "pure red strokes" as a binary mask (white=brush, black=others) from an RGBA/RGB image.
+    Thresholds are a bit lenient to tolerate compression/resampling.
     """
     arr = np.array(img.convert("RGBA"))
     r, g, b, a = arr[..., 0], arr[..., 1], arr[..., 2], arr[..., 3]
+    # condition: high red, low green/blue, and alpha>0
     red_hit = (r >= 200) & (g <= 40) & (b <= 40) & (a > 0)
     mask = (red_hit.astype(np.uint8) * 255)
     dilate_px: int = 0,
 ) -> Optional[Image.Image]:
     """
+    Rules:
+      1) If user uploaded a mask: use it directly (white=mask)
+      2) Otherwise, from ImageEditor output, only recognize "red strokes" as mask:
+         - Try sketch_data['mask'] first (some versions provide it)
+         - Else merge red strokes from sketch_data['layers'][*]['image']
+         - If still none, try sketch_data['composite'] for red strokes
     """
+    # 1) Uploaded mask has highest priority
     if isinstance(upload_mask, Image.Image):
         m = to_grayscale_mask(upload_mask).resize(base_image.size, Image.NEAREST)
         return dilate_mask(m, dilate_px) if dilate_px > 0 else m
+    # 2) Hand-drawn (ImageEditor)
     if isinstance(sketch_data, dict):
+        # 2a) explicit mask (still supported)
         m = sketch_data.get("mask")
         if isinstance(m, Image.Image):
             m = to_grayscale_mask(m).resize(base_image.size, Image.NEAREST)
             return dilate_mask(m, dilate_px) if dilate_px > 0 else m
+        # 2b) merge red strokes from layers
         layers = sketch_data.get("layers")
         acc = None
         if isinstance(layers, list) and layers:
                 li = lyr.get("image") or lyr.get("mask")
                 if isinstance(li, Image.Image):
                     m_layer = _mask_from_red(li, base_image.size)
+                    # merge: any layer with strokes contributes to mask
                     acc = ImageOps.lighter(acc, m_layer)
             if acc.getbbox() is not None:
                 return dilate_mask(acc, dilate_px) if dilate_px > 0 else acc
+        # 2c) finally, search composite for red strokes
         comp = sketch_data.get("composite")
         if isinstance(comp, Image.Image):
             m_comp = _mask_from_red(comp, base_image.size)
             if m_comp.getbbox() is not None:
                 return dilate_mask(m_comp, dilate_px) if dilate_px > 0 else m_comp
+    # 3) still none -> return None (caller will prompt for a mask)
     return None
 def _round_mult64(x: float, mode: str = "nearest") -> int:
     """
+    Align x to a multiple of 64:
+      - mode="ceil"    round up
+      - mode="floor"   round down
+      - mode="nearest" nearest multiple
     """
     if mode == "ceil":
         return int((x + 63) // 64) * 64
 def prepare_size_for_flux(img: Image.Image, target_max: int = 1024) -> tuple[int, int]:
     """
+    Steps:
+    1) First round w,h up to multiples of 64 (avoid too-small sizes)
+    2) Fix the long side to target_max (default 1024)
+    3) Scale the short side proportionally and align to a multiple of 64 (at least 64)
     """
     w, h = img.size
+    # 1) round each up to multiple of 64
     w1 = max(64, _round_mult64(w, mode="ceil"))
     h1 = max(64, _round_mult64(h, mode="ceil"))
+    # 2) fix long side to target_max; scale short side
     if w1 >= h1:
+        out_w = target_max
         scaled_h = h1 * (target_max / w1)
         out_h = max(64, _round_mult64(scaled_h, mode="nearest"))
     else:
         out_w = max(64, _round_mult64(scaled_w, mode="nearest"))
     return int(out_w), int(out_h)
 @spaces.GPU
+# ---------------- Preview depth for canvas (colored) ----------------
 def preview_depth(image: Optional[Image.Image], encoder: str, max_res: int, input_size: int, fp32: bool):
     if image is None:
         return None
     dm = get_model(encoder)
+    # colored visualization (RGB), consistent with your previous colormap style
     d_rgb = dm.infer(image=image, max_res=max_res, input_size=input_size, fp32=fp32, grayscale=False)
     return d_rgb
 def prepare_canvas(image, depth_img, source):
     base = depth_img if source == "depth" else image
     if base is None:
+        raise gr.Error('Please upload an image (and wait for the depth preview), then click "Prepare canvas".')
+    # Use a generic gr.update to set ImageEditor value
     return gr.update(value=base)
 # ---------------- Two-stage pipeline: depth(color) -> fill ----------------
     seed: Optional[int],
 ) -> Tuple[Image.Image, Image.Image]:
     if image is None:
+        raise gr.Error("Please upload an image first.")
+    # 1) produce a colored depth map (RGB)
     depth_model = get_model(encoder)
     depth_rgb: Image.Image = depth_model.infer(
         image=image, max_res=max_res, input_size=input_size, fp32=fp32, grayscale=False
     print(f"[DEBUG] Depth RGB: mode={depth_rgb.mode}, size={depth_rgb.size}")
+    # 2) extract mask (uploaded > drawn)
     mask_l = pick_mask(mask_upload, sketch, image, dilate_px=mask_dilate_px)
     if (mask_l is None) or (mask_l.getbbox() is None):
+        raise gr.Error("No valid mask detected: please draw on the canvas or upload a mask image.")
     print(f"[DEBUG] Mask: mode={mask_l.mode}, size={mask_l.size}, bbox={mask_l.getbbox()}")
+    # 3) decide output size
     width, height = prepare_size_for_flux(depth_rgb, target_max=max_side)
     orig_w, orig_h = image.size
     print(f"[DEBUG] FLUX size: {width}x{height}, original: {orig_w}x{orig_h}")
+    # 4) run FLUX pipeline
+    # Key fix: pass depth_rgb as `image` instead of the original image
     pipe = get_pipe()
     generator = torch.Generator("cpu").manual_seed(int(seed)) if (seed is not None and seed >= 0) else torch.Generator("cpu").manual_seed(random.randint(0, 2**31 - 1))
     result = pipe(
         prompt=prompt,
+        image=depth_rgb,           # FIX: pass the colored depth map, not the original image
         mask_image=mask_l,
         width=width,
         height=height,
         num_inference_steps=int(steps),
         max_sequence_length=512,
         generator=generator,
+        depth=depth_rgb,           # also feed depth input (colored depth)
     ).images[0]
     final_result = result.resize((orig_w, orig_h), Image.BICUBIC)
+    # return result and mask preview
     mask_preview = mask_l.resize((orig_w, orig_h), Image.NEAREST).convert("RGB")
     return final_result, mask_preview
+def _to_pil_rgb(img_like) -> Image.Image:
+    """
+    Normalize an image-like object to a PIL image in RGB mode.
+
+    PIL images of any mode are converted with ``convert("RGB")``. Anything
+    else is passed through ``np.asarray`` and cast to uint8; the PIL mode is
+    then inferred from the array shape before converting to RGB, so
+    grayscale (H, W), RGB (H, W, 3) and RGBA (H, W, 4) arrays are accepted.
+
+    Raises:
+        gr.Error: if the input is None or cannot be turned into an image.
+    """
+    if isinstance(img_like, Image.Image):
+        return img_like.convert("RGB")
+    # numpy array (or array-like) -> PIL
+    try:
+        if img_like is None:
+            raise ValueError("no image provided")
+        arr = np.asarray(img_like).astype(np.uint8)
+        if arr.ndim == 3 and arr.shape[-1] == 1:
+            # Squeeze a trailing singleton channel so the array is handled
+            # as plain 2-D grayscale.
+            arr = arr[..., 0]
+        # Let PIL infer the mode from the shape instead of forcing "RGB":
+        # the forced mode rejected 4-channel (RGBA) arrays.
+        return Image.fromarray(arr).convert("RGB")
+    except Exception as exc:
+        # Surface every conversion failure as a UI error, keeping the cause.
+        raise gr.Error("Stage-2: `depth` / `depth_image` is not a valid image. Please check the provided objects.") from exc
+# ==== STAGE-2 ONLY ADDED: Stage-2 inference (takes Stage-1 output + Stage-1 depth preview) ====
+@spaces.GPU
+def run_stage2_refine(
+    image: Image.Image,              # original image (RGB)
+    stage1_out: Image.Image,         # output from Stage-1
+    depth_img_from_stage1_input: Image.Image,  # Stage-1 depth preview (from UI)
+    mask_upload: Optional[Image.Image],
+    sketch: Optional[dict],
+    prompt: str,
+    encoder: str,
+    max_res: int,
+    input_size: int,
+    fp32: bool,
+    max_side: int,
+    guidance_scale: float,
+    steps: int,
+    seed: Optional[int],
+) -> Image.Image:
+    """
+    Run the Stage-2 pipeline on top of a finished Stage-1 result.
+
+    Inputs handed to the pipeline:
+      - ``image``       = the original image, normalized to RGB
+      - ``depth``       = the Stage-1 output, resized to the FLUX size
+      - ``depth_image`` = the Stage-1 depth preview, resized to the FLUX size
+      - ``mask_image``  = the uploaded / drawn mask, or an all-black mask
+                          when none is found (refine without a masked area)
+
+    ``encoder``, ``max_res``, ``input_size`` and ``fp32`` are accepted so the
+    UI wiring matches Stage-1 but are not used by this function.
+
+    Returns:
+        The pipeline output resized to ``(3 * original_width, original_height)``.
+
+    Raises:
+        gr.Error: if the original image or the Stage-1 output is missing, or
+            if an input cannot be converted to an image.
+    """
+    if image is None or stage1_out is None:
+        raise gr.Error("Please complete Stage-1 generation first (needs original image and Stage-1 output).")
+    # Normalize all image inputs before the (expensive) pipeline load so that
+    # invalid inputs fail fast.
+    image_rgb = _to_pil_rgb(image)                              # for `image`
+    depth_pil = _to_pil_rgb(stage1_out)                         # for `depth`
+    depth_image_pil = _to_pil_rgb(depth_img_from_stage1_input)  # for `depth_image`
+    # allow refine without mask (use all-black -> no masked area)
+    mask_l = pick_mask(mask_upload, sketch, image, dilate_px=0)
+    if (mask_l is None) or (mask_l.getbbox() is None):
+        mask_l = Image.new("L", image.size, 0)
+    # unify sizes (based on original image)
+    width, height = prepare_size_for_flux(image, target_max=max_side)
+    orig_w, orig_h = image.size
+    depth_pil = depth_pil.resize((width, height), Image.BICUBIC)
+    depth_image_pil = depth_image_pil.resize((width, height), Image.BICUBIC)
+    # A non-negative seed is used as given; otherwise draw a random one.
+    if seed is not None and seed >= 0:
+        seed_value = int(seed)
+    else:
+        seed_value = random.randint(0, 2**31 - 1)
+    g2 = torch.Generator("cpu").manual_seed(seed_value)
+    pipe2 = get_pipe_stage2()
+    out2 = pipe2(
+        prompt=prompt,
+        image=image_rgb,                         # original image, normalized to RGB
+        mask_image=mask_l,
+        width=width,
+        height=height,
+        guidance_scale=float(guidance_scale),
+        num_inference_steps=int(steps),
+        max_sequence_length=512,
+        generator=g2,
+        depth=depth_pil,                         # Stage-1 output as `depth`
+        depth_image=depth_image_pil,             # Stage-1 depth preview as `depth_image`
+    ).images[0]
+    # Keep the existing x3-wide display layout.
+    # NOTE(review): presumably the Stage-2 pipeline returns three panels side
+    # by side -- confirm against the pipeline implementation.
+    out2 = out2.resize((orig_w * 3, orig_h), Image.BICUBIC)
+    return out2
+# ===================================================================
 # ---------------- UI ----------------
 with gr.Blocks() as demo:
+    gr.Markdown("## GeoRemover · Depth Removal (Depth (colored) → FLUX Fill)")
     with gr.Row():
         with gr.Column(scale=1):
+            # input image
             img = gr.Image(label="Upload image", type="pil")
+            # Mask: upload or draw
             with gr.Tab("Upload mask"):
                 mask_upload = gr.Image(label="Mask (optional)", type="pil")
                 sketch = gr.ImageEditor(
                     label="Sketch mask (draw with brush)",
                     type="pil",
+                    # Provide red-only brush for precise extraction of strokes
                     brush=gr.Brush(colors=["#FF0000"], default_size=24)
                 )
             # prompt
             prompt = gr.Textbox(label="Prompt", value="A beautiful scene")
+            # tunables
             with gr.Accordion("Advanced (Depth & FLUX)", open=False):
                 encoder = gr.Dropdown(["vits", "vitl"], value="vitl", label="Depth encoder")
                 max_res = gr.Slider(512, 2048, value=1280, step=64, label="Depth: max_res")
                 mask_dilate_px = gr.Slider(0, 128, value=0, step=1, label="Mask dilation (px)")
                 guidance_scale = gr.Slider(0, 50, value=30, step=0.5, label="FLUX: guidance_scale")
                 steps = gr.Slider(10, 75, value=50, step=1, label="FLUX: steps")
+                seed = gr.Number(value=0, precision=0, label="Seed (>=0 fixed; empty = random)")
             run_btn = gr.Button("Run", variant="primary")
+            # ==== STAGE-2 ONLY ADDED: add Stage-2 button ====
+            run_btn_stage2 = gr.Button("Run Stage-2 (Refine)", variant="secondary")
+            # =================================================
         with gr.Column(scale=1):
             depth_preview = gr.Image(label="Depth preview (colored)", interactive=False)
+            mask_preview = gr.Image(label="Mask preview (to be removed)", interactive=False)
             out = gr.Image(label="Output")
+            # ==== STAGE-2 ONLY ADDED: Stage-2 output ====
+            out_stage2 = gr.Image(label="Output (Stage-2 refine)")
+            # ============================================
+    # Event: when image changes, compute the colored depth preview
     img.change(
         fn=preview_depth,
         inputs=[img, encoder, max_res, input_size, fp32],
         outputs=[depth_preview],
     )
+    # Prepare canvas: put original image or colored depth image into ImageEditor
     prepare_btn.click(
         fn=prepare_canvas,
         inputs=[img, depth_preview, draw_source],
         outputs=[sketch],
     )
+    # Run Stage-1 (wiring unchanged)
     run_btn.click(
         fn=run_depth_and_fill,
         inputs=[img, mask_upload, sketch, prompt, encoder, max_res, input_size, fp32,
         api_name="run",
     )
+    # ==== STAGE-2 ONLY ADDED: run after Stage-1 has produced a result ====
+    run_btn_stage2.click(
+        fn=run_stage2_refine,
+        inputs=[img, out, depth_preview,  # ← pass depth_preview as the 3rd input to Stage-2
+                mask_upload, sketch, prompt, encoder, max_res, input_size, fp32,
+                max_side, guidance_scale, steps, seed],
+        outputs=[out_stage2],
+        api_name="run_stage2",
+    )
+    # ====================================================================
 if __name__ == "__main__":
     os.environ.setdefault("HF_HUB_DISABLE_TELEMETRY", "1")
+    demo.launch(server_name="0.0.0.0", server_port=7860)

chore: ignore pyc and __pycache__

chore: ignore pyc and __pycache__