File size: 13,466 Bytes
ed88b1e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4b43c9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed88b1e
4b43c9c
a1fd2c6
 
ed88b1e
 
a1fd2c6
ed88b1e
 
a1fd2c6
ed88b1e
 
 
a1fd2c6
4b43c9c
 
 
 
 
 
 
 
ed88b1e
 
4b43c9c
 
 
 
ed88b1e
 
2bb358f
ed88b1e
2bb358f
 
4b43c9c
 
 
 
 
 
 
 
ed88b1e
 
 
 
 
2bb358f
 
 
 
 
 
 
 
 
 
 
 
ed88b1e
a1fd2c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed88b1e
 
 
 
 
2bb358f
ed88b1e
2bb358f
 
 
ed88b1e
 
2bb358f
 
 
 
 
4b43c9c
 
 
 
2bb358f
 
 
ed88b1e
2bb358f
 
 
 
 
 
 
 
ed88b1e
4b43c9c
 
 
 
 
 
 
 
 
 
 
 
2bb358f
 
4b43c9c
 
 
2bb358f
ed88b1e
4b43c9c
 
 
 
2bb358f
 
 
 
 
 
 
 
 
 
 
 
ed88b1e
a1fd2c6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ed88b1e
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
"""
DARKROOM HandRefiner — Hugging Face ZeroGPU Space
=================================================
Standard Gradio Interface (the pattern ZeroGPU actually supports): upload an
image, optionally paint a mask, get the hands structurally fixed on a free
on-demand GPU. This is the reliable shape — the previous "custom FastAPI route"
build failed with "No @spaces.GPU function detected" because ZeroGPU only
detects GPU functions wired into a normal Gradio app.

PIPELINE: MeshGraphormer hand-mesh -> depth map -> depth ControlNet ->
Stable Diffusion inpainting (HandRefiner). Fixes only the hand region.

--------------------------------------------------------------------------
DEPLOY  (needs a HF PRO account to CREATE a ZeroGPU Space — $9/mo)
--------------------------------------------------------------------------
1. huggingface.co -> New Space -> SDK: Gradio -> Hardware: ZeroGPU
2. Upload: app.py, requirements.txt, README.md
3. Wait for build, then use the Space UI (or call it from the DARKROOM tool
   via the gradio_client endpoint shown on the Space's "View API" page).

HONEST LIMITS:
* Creating a ZeroGPU Space requires PRO. Using one is free within a daily quota
  (resets 24h after first use); each fix is a few GPU-seconds.
* GPU duration is capped (~120s max). Both GPU functions request the full 120s.
* Stock depth ControlNet is okay-not-perfect; swap CONTROLNET_ID to
  hr16/ControlNet-HandRefiner-pruned for finetuned quality.
* MeshGraphormer can't fix unreadable hands or crossed fingers.
"""

import spaces                       # must precede torch for ZeroGPU
import torch
from PIL import Image, ImageFilter
import gradio as gr

# ---------------------------------------------------------------------------
# transformers compatibility shim (fixes MeshGraphormer import on new transformers)
# Newer transformers removed prune_linear_layer / Conv1D from transformers.modeling_utils,
# which is exactly what breaks the vendored MeshGraphormer (ComfyUI issue #578).
# Re-expose them so the legacy import succeeds.
# ---------------------------------------------------------------------------
def _patch_transformers():
    """Re-expose legacy helpers on ``transformers.modeling_utils`` when absent.

    For each of ``prune_linear_layer``, ``Conv1D`` and ``prune_layer`` that is
    missing from ``transformers.modeling_utils`` but available in
    ``transformers.pytorch_utils``, the attribute is copied across so legacy
    ``from transformers.modeling_utils import ...`` statements keep working.

    Best effort: any exception (including transformers not being installed)
    is printed and swallowed so the app keeps booting.
    """
    legacy_names = ("prune_linear_layer", "Conv1D", "prune_layer")
    try:
        import transformers.modeling_utils as mu
        absent = [name for name in legacy_names if not hasattr(mu, name)]
        if not absent:
            # This transformers build still exposes everything; nothing to do.
            return
        from transformers import pytorch_utils as pu
        for name in absent:
            if hasattr(pu, name):
                setattr(mu, name, getattr(pu, name))
        print("[shim] transformers symbols patched", flush=True)
    except Exception as e:
        print("[shim] transformers patch skipped:", e, flush=True)
_patch_transformers()

# --- Hugging Face model ids -------------------------------------------------
# Base checkpoint for the ControlNet inpaint pipeline built in _load().
SD_INPAINT_ID = "runwayml/stable-diffusion-inpainting"
# ControlNet attached to the inpaint pipeline in _load().
CONTROLNET_ID = "lllyasviel/control_v11f1p_sd15_depth"
TILE_CN_ID = "lllyasviel/control_v11f1e_sd15_tile"        # detail-regeneration ControlNet
SD_BASE_ID = "runwayml/stable-diffusion-v1-5"             # base SD for img2img detail pass
# Repo id handed to MeshGraphormerDetector.from_pretrained() in _make_mesh_detector().
MESHGRAPHORMER_ID = "hr16/ControlNet-HandRefiner-pruned"
# Longest side (pixels) that _fit() allows for the hand-fix working image.
MAX_SIDE = 768
DETAIL_MAX_SIDE = 1280   # detail pass can work larger since it's tiled-friendly
# Used by fix_hands() when the prompt box is empty, and as the Textbox default.
DEFAULT_PROMPT = "a detailed, anatomically correct hand with five fingers, natural proportions, same art style and lighting"
# Negative prompt for the hand-fix pipeline.
NEG = "extra fingers, fused fingers, missing fingers, deformed, mutated, blurry, low quality"
# Negative prompt for the detail pipeline.
DETAIL_NEG = "blurry, soft, out of focus, jpeg artifacts, low quality, smudged, messy lines"

# --- Process-wide model caches (filled by _load() / _load_detail()) ----------
_PIPE = None        # ControlNet inpaint pipeline; None until _load() succeeds
_MESH = None        # MeshGraphormer detector, or None when it could not be created
_DETAIL = None      # tile-ControlNet img2img pipeline; None until _load_detail() succeeds
_MESH_OK = False    # True only after _make_mesh_detector() succeeded inside _load()
_MESH_ERR = None    # str() of the exception that made MeshGraphormer unavailable, if any

def _make_mesh_detector():
    """Create the MeshGraphormer hand detector from ``MESHGRAPHORMER_ID``.

    The import is done lazily, inside the function, so that a controlnet_aux
    build without a top-level ``MeshGraphormerDetector`` only fails here, where
    the caller can catch it. (The original author's note: controlnet_aux==0.0.6
    ships the class at the top level and newer versions dropped it, hence the
    version pin.)

    Returns:
        The object returned by ``MeshGraphormerDetector.from_pretrained``.
    """
    from controlnet_aux import MeshGraphormerDetector
    detector = MeshGraphormerDetector.from_pretrained(MESHGRAPHORMER_ID)
    return detector

def _load():
    """Build the hand-fix models once and cache them in module globals.

    Populates ``_PIPE`` (SD inpaint + ControlNet) and tries to populate
    ``_MESH`` (MeshGraphormer). The detector is optional: if creating it
    raises, ``_MESH`` stays ``None``, ``_MESH_OK`` is ``False`` and the error
    text is kept in ``_MESH_ERR``; the pipeline is still built. No device move
    happens here. Calling again after ``_PIPE`` is set is a no-op.

    Raises:
        Whatever diffusers raises if the ControlNet or pipeline cannot be
        loaded; ``_PIPE`` is left as ``None`` in that case.
    """
    global _PIPE, _MESH, _MESH_OK, _MESH_ERR
    if _PIPE is not None:
        return
    import time
    from diffusers import StableDiffusionControlNetInpaintPipeline, ControlNetModel, UniPCMultistepScheduler
    t0 = time.time()
    print("[load] starting model load on CPU…", flush=True)

    # MeshGraphormer is optional — keep its failure isolated from the pipeline load.
    try:
        _MESH = _make_mesh_detector()
        _MESH_OK = True
        print(f"[load] meshgraphormer ok ({time.time()-t0:.0f}s)", flush=True)
    except Exception as e:
        _MESH = None
        _MESH_OK = False
        _MESH_ERR = str(e)
        print("[load] meshgraphormer UNAVAILABLE (manual mask still works):", e, flush=True)

    depth_cn = ControlNetModel.from_pretrained(CONTROLNET_ID, torch_dtype=torch.float16)
    inpaint = StableDiffusionControlNetInpaintPipeline.from_pretrained(
        SD_INPAINT_ID, controlnet=depth_cn, torch_dtype=torch.float16, safety_checker=None
    )
    inpaint.scheduler = UniPCMultistepScheduler.from_config(inpaint.scheduler.config)

    # Memory savers are nice-to-have: log and continue if a build lacks them.
    optional_switches = (
        ("enable_attention_slicing", "[load] attn-slicing skip:"),
        ("enable_vae_tiling", "[load] vae-tiling skip:"),
    )
    for method_name, skip_msg in optional_switches:
        try:
            getattr(inpaint, method_name)()
        except Exception as e:
            print(skip_msg, e, flush=True)

    _PIPE = inpaint
    print(f"[load] pipeline ready on CPU ({time.time()-t0:.0f}s total)", flush=True)

# preload at import — runs once when the container boots, OUTSIDE any GPU-timed window.
# A failure here is only logged, not fatal: fix_hands() calls _load() again on
# each request, and _load() retries for as long as _PIPE is still None.
try:
    _load()
except Exception as _e:
    print("[load] preload deferred:", _e, flush=True)

def _load_detail():
    """Build the tile-ControlNet img2img pipeline once and cache it in ``_DETAIL``.

    Loaded lazily on first use (there is no import-time preload for this one)
    and without any device move. Calling again after ``_DETAIL`` is set is a
    no-op.

    Raises:
        Whatever diffusers raises if the ControlNet or pipeline cannot be
        loaded; ``_DETAIL`` is left as ``None`` in that case.
    """
    global _DETAIL
    if _DETAIL is not None:
        return
    import time
    from diffusers import StableDiffusionControlNetImg2ImgPipeline, ControlNetModel, UniPCMultistepScheduler
    t0 = time.time()
    print("[load] detail pipeline (tile CN) on CPU…", flush=True)

    tile_cn = ControlNetModel.from_pretrained(TILE_CN_ID, torch_dtype=torch.float16)
    img2img = StableDiffusionControlNetImg2ImgPipeline.from_pretrained(
        SD_BASE_ID, controlnet=tile_cn, torch_dtype=torch.float16, safety_checker=None
    )
    img2img.scheduler = UniPCMultistepScheduler.from_config(img2img.scheduler.config)

    # Memory savers are nice-to-have: log and continue if a build lacks them.
    optional_switches = (
        ("enable_attention_slicing", "[load] attn-slicing skip:"),
        ("enable_vae_tiling", "[load] vae-tiling skip:"),
    )
    for method_name, skip_msg in optional_switches:
        try:
            getattr(img2img, method_name)()
        except Exception as e:
            print(skip_msg, e, flush=True)

    _DETAIL = img2img
    print(f"[load] detail pipeline ready ({time.time()-t0:.0f}s)", flush=True)

def _fit_to(img, max_side):
    """Shrink ``img`` so its longest side is at most ``max_side``.

    The image is never enlarged (the scale factor is capped at 1.0). Each
    output dimension is rounded to the nearest multiple of 8, with a floor
    of 8 pixels.

    Returns:
        ``(resized_image, (original_width, original_height))`` so the caller
        can scale a result back to the source size.
    """
    orig_w, orig_h = img.size
    ratio = min(1.0, max_side / max(orig_w, orig_h))

    def snap(dim):
        # Nearest multiple of 8, never below 8.
        return max(8, int(round(dim * ratio / 8)) * 8)

    resized = img.resize((snap(orig_w), snap(orig_h)), Image.LANCZOS)
    return resized, (orig_w, orig_h)

def _fit(img):
    """Fit ``img`` to the hand-fix working size (longest side <= ``MAX_SIDE``).

    Thin wrapper around ``_fit_to`` so the resize/rounding rule lives in one
    place instead of being duplicated.

    Returns:
        ``(resized_image, (original_width, original_height))``.
    """
    return _fit_to(img, MAX_SIDE)

def _mask_from_layer(layer, size):
    """Turn one painted editor layer into an "L"-mode mask of ``size``, or None.

    If the layer has an alpha band that is not fully opaque, the alpha channel
    is used as the mask: on a transparent layer the strokes live in alpha, and
    a dark brush would vanish under a plain luminance conversion. A layer with
    no alpha band, or a fully opaque one (e.g. a white-on-black mask sent by an
    API client), falls back to luminance.
    NOTE(review): assumes gr.ImageMask(type="pil") yields transparent RGBA
    layers — confirm against the installed Gradio version.

    Returns None when the layer is missing or nothing is painted on it.
    """
    if layer is None:
        return None
    mask = None
    if "A" in layer.getbands():
        alpha = layer.getchannel("A")
        if alpha.getextrema()[0] < 255:
            mask = alpha
    if mask is None:
        mask = layer.convert("L")
    mask = mask.resize(size, Image.LANCZOS)
    return mask if mask.getbbox() is not None else None


@spaces.GPU(duration=120)
def fix_hands(image, mask_layers, prompt, strength):
    """Regenerate the hand region of an image on a ZeroGPU-attached GPU.

    Args:
        image: A PIL image, or an image-editor value dict whose "background"
            entry is the uploaded picture (the UI passes the ImageMask value).
        mask_layers: Optional editor value dict; the first painted entry of
            its "layers" list is used as the inpaint mask.
        prompt: Text prompt; ``DEFAULT_PROMPT`` is used when empty.
        strength: Denoise strength passed to the inpaint pipeline.

    Returns:
        The repaired PIL image, resized back to the input's original size.

    Raises:
        gr.Error: if no image was supplied, if no mask is available (neither
            painted nor auto-detected), or if anything else fails (wrapped as
            "Fix failed: ...").
    """
    import time
    import traceback

    # The ImageMask component delivers a dict; the picture is its background.
    src = image.get("background") if isinstance(image, dict) else image
    if src is None:
        raise gr.Error("Upload an image first.")
    try:
        t0 = time.time()
        _load()  # no-op if already loaded
        _PIPE.to("cuda")
        # MeshGraphormer is optional: only touch it when it actually loaded,
        # otherwise the manual-mask path must keep working.
        mesh_ready = _MESH_OK and _MESH is not None
        if mesh_ready:
            try:
                _MESH.to("cuda")
            except Exception as e:
                print("[fix] mesh .to(cuda) skipped:", e, flush=True)
        print(f"[fix] models on GPU, t={time.time()-t0:.0f}s (mesh={_MESH_OK})", flush=True)
        init, (ow, oh) = _fit(src.convert("RGB"))
        W, H = init.size
        print(f"[fix] input fitted to {W}x{H}", flush=True)

        # optional hand-drawn mask from the ImageMask component
        sent_mask = None
        if isinstance(mask_layers, dict):
            for layer in mask_layers.get("layers") or []:
                sent_mask = _mask_from_layer(layer, (W, H))
                if sent_mask is not None:
                    break

        depth_img = None
        auto_mask = None
        if mesh_ready:
            print("[fix] running MeshGraphormer…", flush=True)
            try:
                mg = _MESH(init)
                if isinstance(mg, tuple):
                    depth_img = mg[0]
                    auto_mask = mg[1] if len(mg) > 1 else None
                else:
                    depth_img = mg
                if depth_img is not None:
                    depth_img = depth_img.convert("RGB").resize((W, H), Image.LANCZOS)
            except Exception as e:
                print("[fix] mesh inference failed, falling back to mask:", e, flush=True)

        # A painted mask wins over the auto-detected one.
        if sent_mask is not None:
            mask_img = sent_mask
        elif auto_mask is not None:
            mask_img = auto_mask.convert("L").resize((W, H), Image.LANCZOS)
        else:
            mask_img = None
        if mask_img is None:
            if not _MESH_OK:
                raise gr.Error("Auto hand-detection isn't available on this Space build. "
                               "Paint a mask over the bad hand (use the brush on the image) and run again.")
            raise gr.Error("No hands detected. Paint a mask over the hand and try again.")

        # if we have no depth (no mesh), use the source image itself as the control
        if depth_img is None:
            depth_img = init  # tile/identity-style guidance keeps structure from the source

        mask_img = mask_img.filter(ImageFilter.GaussianBlur(2))  # soften the mask edge
        print("[fix] running diffusion…", flush=True)
        out = _PIPE(
            prompt=prompt or DEFAULT_PROMPT, negative_prompt=NEG, image=init, mask_image=mask_img,
            control_image=depth_img, num_inference_steps=25, strength=float(strength),
            guidance_scale=7.5, controlnet_conditioning_scale=0.7,
        ).images[0]
        print(f"[fix] done, total {time.time()-t0:.0f}s", flush=True)
        return out.resize((ow, oh), Image.LANCZOS)
    except gr.Error:
        # Already a user-facing message; don't re-wrap it as "Fix failed: ...".
        raise
    except Exception as e:
        print("[fix] ERROR:\n" + traceback.format_exc(), flush=True)
        raise gr.Error(f"Fix failed: {e}") from e

@spaces.GPU(duration=120)
def detail_pass(image, strength, scale):
    """Run the tile-ControlNet img2img detail pass on a ZeroGPU-attached GPU.

    The image (optionally enlarged by ``scale`` first) is fitted to
    ``DETAIL_MAX_SIDE`` and fed to the pipeline as both the init image and the
    control image, with an empty prompt.

    Args:
        image: A PIL image, or a dict whose "background" entry is the image.
        strength: Denoise strength passed to the pipeline.
        scale: Enlargement factor applied with Lanczos before the pass;
            values up to 1.01 leave the size unchanged.

    Returns:
        The processed PIL image at the (possibly enlarged) source size.

    Raises:
        gr.Error: if no image was supplied, or wrapping any failure as
            "Detail pass failed: ...".
    """
    import time
    import traceback

    if image is None:
        raise gr.Error("Upload an image first.")
    try:
        t0 = time.time()
        _load_detail()
        _DETAIL.to("cuda")

        if isinstance(image, dict):
            source = image["background"]
        else:
            source = image
        source = source.convert("RGB")

        # Optional Lanczos enlargement; the model then fills in detail at the higher res.
        scale = float(scale)
        if scale > 1.01:
            enlarged_size = (int(source.width*scale), int(source.height*scale))
            source = source.resize(enlarged_size, Image.LANCZOS)

        work, (ow, oh) = _fit_to(source, DETAIL_MAX_SIDE)
        print(f"[detail] working at {work.size}, denoise={strength}", flush=True)

        # Tile ControlNet is conditioned on the image itself.
        result = _DETAIL(
            prompt="",
            negative_prompt=DETAIL_NEG,
            image=work,
            control_image=work,
            num_inference_steps=30,
            strength=float(strength),
            guidance_scale=6.0,
            controlnet_conditioning_scale=1.0,
        )
        refined = result.images[0]
        if refined.size != (ow, oh):
            refined = refined.resize((ow, oh), Image.LANCZOS)
        print(f"[detail] done, total {time.time()-t0:.0f}s", flush=True)
        return refined
    except Exception as e:
        print("[detail] ERROR:\n" + traceback.format_exc(), flush=True)
        raise gr.Error(f"Detail pass failed: {e}")

# ---------------------------------------------------------------------------
# Gradio UI: one Blocks app with two tabs, each wired to one @spaces.GPU
# function. The api_name values set the endpoint names for the two handlers.
# ---------------------------------------------------------------------------
with gr.Blocks(title="DARKROOM", theme=gr.themes.Base()) as demo:
    gr.Markdown("## 🎨 DARKROOM\nAI-art repair on GPU. **Fix hands** regenerates malformed hands "
                "with correct geometry. **Add detail** uses Tile-ControlNet img2img to recover real "
                "sharpness and clean lineart while keeping your original style.")
    # Tab 1: hand repair (fix_hands).
    with gr.Tab("Fix hands"):
        with gr.Row():
            with gr.Column():
                # One component serves as both the upload and the mask-painting surface.
                inp = gr.ImageMask(type="pil", label="Image (optionally paint over the bad hand)")
                prompt = gr.Textbox(value=DEFAULT_PROMPT, label="Prompt", lines=2)
                strength = gr.Slider(0.3, 1.0, value=0.75, step=0.05, label="Fix strength (denoise)")
                btn = gr.Button("Fix hands", variant="primary")
            with gr.Column():
                out = gr.Image(type="pil", label="Result")
        # `inp` is listed twice on purpose: fix_hands receives the same editor value
        # as both `image` and `mask_layers`, and must pick the picture and the
        # painted layers out of it.
        btn.click(fix_hands, inputs=[inp, inp, prompt, strength], outputs=out, api_name="fix_hands")
    # Tab 2: detail / sharpening pass (detail_pass).
    with gr.Tab("Add detail"):
        with gr.Row():
            with gr.Column():
                dinp = gr.Image(type="pil", label="Image to sharpen / add detail")
                dstrength = gr.Slider(0.15, 0.6, value=0.3, step=0.05,
                    label="Detail strength (low = safe & on-style, high = more new detail / more drift)")
                # step=0.5 over 1.0–2.0 gives the choices 1.0, 1.5 and 2.0.
                dscale = gr.Slider(1.0, 2.0, value=1.0, step=0.5, label="Enlarge first (×)")
                dbtn = gr.Button("Add detail", variant="primary")
            with gr.Column():
                dout = gr.Image(type="pil", label="Result")
        dbtn.click(detail_pass, inputs=[dinp, dstrength, dscale], outputs=dout, api_name="detail_pass")

# Script entry point: enable the request queue, then start the server.
if __name__ == "__main__":
    demo.queue().launch()