Spaces:
Running on Zero
Update app.py
app.py
CHANGED
@@ -1,5 +1,10 @@
 import os
+# Suggestion: reduce CUDA memory fragmentation (helps with the occasional NVML/CUDACachingAllocator errors on HF Spaces).
+# This environment variable must be set before importing torch to take full effect.
+os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:128")
+
 import gc
+import math
 import gradio as gr
 import numpy as np
 import spaces
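For context, the allocator setting added above only matters if it is in the environment before the CUDA caching allocator is initialized, which the commit comment treats as "before import torch". A minimal ordering sketch (the variable name and its options are standard PyTorch allocator settings; the values simply mirror the ones used in this commit):

# Sketch: put the allocator config in place before torch is imported anywhere in the process.
import os
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "expandable_segments:True,max_split_size_mb:128")

import torch  # imported only after the environment variable is set

if torch.cuda.is_available():
    torch.cuda.init()  # the caching allocator reads PYTORCH_CUDA_ALLOC_CONF when it initializes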
@@ -10,9 +15,6 @@ from typing import Iterable
 from gradio.themes import Soft
 from gradio.themes.utils import colors, fonts, sizes
 
-# =========================
-# Theme
-# =========================
 colors.orange_red = colors.Color(
     name="orange_red",
     c50="#FFF0E5",
@@ -87,11 +89,20 @@ print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
 print("torch.__version__ =", torch.__version__)
 print("Using device:", device)
 
+# Optional: good for inference speed
+try:
+    torch.backends.cuda.matmul.allow_tf32 = True
+except Exception:
+    pass
+
 from diffusers import FlowMatchEulerDiscreteScheduler
 from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
 from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
 from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
 
+# Key: import the module itself so that VAE_IMAGE_SIZE can be patched dynamically (workaround)
+import qwenimage.pipeline_qwenimage_edit_plus as qwen_edit_module
+
 dtype = torch.bfloat16
 
 pipe = QwenImageEditPlusPipeline.from_pretrained(
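The module-level import added above exists only so that VAE_IMAGE_SIZE can be overwritten per request (see the changes to infer further down). A hedged sketch of how that monkey-patch could be guarded, assuming the vendored qwenimage.pipeline_qwenimage_edit_plus module really does define a module-level VAE_IMAGE_SIZE constant as this commit expects:

import qwenimage.pipeline_qwenimage_edit_plus as qwen_edit_module

def set_vae_image_size(num_pixels: int) -> None:
    # Fail loudly if the vendored pipeline no longer exposes the constant,
    # instead of silently creating a new attribute that nothing reads.
    if not hasattr(qwen_edit_module, "VAE_IMAGE_SIZE"):
        raise AttributeError("qwenimage pipeline does not define VAE_IMAGE_SIZE; the workaround no longer applies")
    qwen_edit_module.VAE_IMAGE_SIZE = int(num_pixels)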
@@ -110,6 +121,19 @@ try:
 except Exception as e:
     print(f"Warning: Could not set FA3 processor: {e}")
 
+# Save VRAM: VAE slicing/tiling (these methods may or may not exist depending on the diffusers version, so wrap them in try)
+try:
+    pipe.enable_vae_slicing()
+    print("VAE slicing enabled.")
+except Exception as e:
+    print(f"Warning: enable_vae_slicing not available: {e}")
+
+try:
+    pipe.enable_vae_tiling()
+    print("VAE tiling enabled.")
+except Exception as e:
+    print(f"Warning: enable_vae_tiling not available: {e}")
+
 MAX_SEED = np.iinfo(np.int32).max
 
 ADAPTER_SPECS = {
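Both helpers above are wrapped in try/except because enable_vae_slicing() / enable_vae_tiling() are not present on every diffusers pipeline class. A hedged fallback sketch: the pipe.vae.enable_slicing() / enable_tiling() calls are the AutoencoderKL-level equivalents in diffusers, and whether the Qwen VAE exposes them is an assumption.

def enable_vae_memory_savers(pipe) -> None:
    # Prefer the pipeline-level helpers, then fall back to the VAE's own methods if present.
    for pipe_method, vae_method in (("enable_vae_slicing", "enable_slicing"),
                                    ("enable_vae_tiling", "enable_tiling")):
        fn = getattr(pipe, pipe_method, None)
        if callable(fn):
            try:
                fn()
                continue
            except Exception:
                pass
        vae_fn = getattr(getattr(pipe, "vae", None), vae_method, None)
        if callable(vae_fn):
            vae_fn()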
@@ -187,126 +211,90 @@ ADAPTER_SPECS = {
 
 LOADED_ADAPTERS = set()
 
-#
-# Size
-#
-
-        return max(1, int(x))
-    v = int(round(x / m) * m)
-    return max(m, v)
-
-def _floor_to_multiple(x: int, m: int) -> int:
-    """Round down to a multiple of m (guaranteed >= m)."""
-    if m <= 0:
-        return max(1, int(x))
-    v = int(x // m * m)
-    return max(m, v)
-
-def _clamp_by_max_patches(width: int, height: int, m: int, max_patches: int) -> tuple[int, int]:
+# -----------------------------
+# Core size logic (fixes the "crop to the middle" issue and keeps the 2x option from blowing up)
+# -----------------------------
+MAX_IMAGE_SEQ_LEN = 4096  # the default max_seq_len of calculate_shift in the pipeline (used here as a safety cap)
+
+def _calculate_dimensions_like_pipeline(target_area: float, ratio: float) -> tuple[int, int]:
     """
-
-    patches = (width//m) * (height//m)
-    where m = vae_scale_factor*2.
-    Going past max_patches (usually 4096) easily causes errors, blows up VRAM, or exceeds the model limit.
+    Stay consistent with calculate_dimensions inside the pipeline: the closest size aligned to 32.
     """
-    width =
-    height =
-
-    # First scale by the area ratio (keeping the aspect ratio)
-    scale = (max_patches / float(patches)) ** 0.5
-    width = int(width * scale)
-    height = int(height * scale)
-
-    # After scaling, floor-align to the multiple so the cap is not exceeded again
-    width = _floor_to_multiple(width, m)
-    height = _floor_to_multiple(height, m)
-
-    # Still over the cap in extreme cases: squeeze a little further with a simple loop
-    while (width // m) * (height // m) > max_patches and width > m and height > m:
-        if width >= height:
-            width = max(m, width - m)
-        else:
-            height = max(m, height - m)
-
+    width = math.sqrt(target_area * ratio)
+    height = width / ratio
+    width = round(width / 32) * 32
+    height = round(height / 32) * 32
+    width = max(32, int(width))
+    height = max(32, int(height))
     return width, height
 
-def
+def _pick_infer_size(pil_image: Image.Image, size_mode: str) -> tuple[int, int, int, int]:
     """
-
-    Note: the input image is never cropped here; only the output width/height are computed.
+    Returns: (infer_w, infer_h, requested_w, requested_h)
+    - requested_w/h: the output size the user asked for (original size / 2x)
+    - infer_w/h: the size actually fed to the model for this run (32-aligned, seq_len capped, to avoid OOM)
     """
-
-        return 1024, 1024
-
-    ow, oh = pil_img.size
-
-    # m comes from the pipeline's requirement: height/width must be divisible by vae_scale_factor*2 (otherwise they get realigned/recomputed)
-    multiple_of = int(getattr(pipe, "vae_scale_factor", 8) * 2)
+    ow, oh = pil_image.size
 
-    if size_mode.startswith("原图2倍"):
-        scale = 2
+    if size_mode == "原图的2倍":
+        req_w, req_h = ow * 2, oh * 2
+    else:
+        req_w, req_h = ow, oh
 
-    # “
+    # The pixel dimensions for this QwenImage pipeline must ultimately be divisible by (vae_scale_factor*2); otherwise the pipeline force-reshapes/truncates them internally.
+    multiple_of = max(16, int(getattr(pipe, "vae_scale_factor", 8)) * 2)
 
+    # Use the "patch count" as a hard cap: seq_len = (w/m)*(h/m); going past 4096 very easily causes VRAM/speed problems (even OOM)
+    # max_area = 4096 * m * m
+    max_area = MAX_IMAGE_SEQ_LEN * (multiple_of * multiple_of)
 
+    req_area = req_w * req_h
+    ratio = req_w / req_h
 
-def update_dimensions_on_upload(image):
-    if image is None:
-        return 1024, 1024
-
-    #
-
-    new_width = (new_width // 8) * 8
-    new_height = (new_height // 8) * 8
+    # If the requested area is too large, first shrink it proportionally to max_area ("the closest size that can actually run")
+    if req_area > max_area:
+        scale = math.sqrt(max_area / req_area)
+        target_area = req_area * scale * scale
+    else:
+        target_area = req_area
+
+    infer_w, infer_h = _calculate_dimensions_like_pipeline(target_area, ratio)
+
+    # Align to m once more (32 is normally already a multiple of 16, but play it safe)
+    infer_w = (infer_w // multiple_of) * multiple_of
+    infer_h = (infer_h // multiple_of) * multiple_of
+    infer_w = max(multiple_of, infer_w)
+    infer_h = max(multiple_of, infer_h)
+
+    # Final safety net: make sure seq_len <= 4096
+    while (infer_w // multiple_of) * (infer_h // multiple_of) > MAX_IMAGE_SEQ_LEN:
+        if infer_w >= infer_h:
+            infer_w -= multiple_of
+        else:
+            infer_h -= multiple_of
+        if infer_w < multiple_of or infer_h < multiple_of:
+            break
+
+    return infer_w, infer_h, req_w, req_h
+
+def _maybe_cuda_cleanup():
+    gc.collect()
+    if torch.cuda.is_available():
+        torch.cuda.empty_cache()
 
 @spaces.GPU
 def infer(
     images,
     prompt,
     lora_adapter,
+    target_size_mode,  # new: target size option
     seed,
     randomize_seed,
     guidance_scale,
     steps,
     progress=gr.Progress(track_tqdm=True)
 ):
-
-    torch.cuda.empty_cache()
+    _maybe_cuda_cleanup()
 
     if not images:
         raise gr.Error("Please upload at least one image to edit.")
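A standalone sketch of the arithmetic _pick_infer_size performs, with the pipeline replaced by a hard-coded vae_scale_factor of 8 (so multiple_of = 16, an assumption; the real code reads it from pipe) and a hypothetical 1920x1080 input at the 2x setting. The numbers in the comments follow directly from the formulas in the added code:

import math

MAX_IMAGE_SEQ_LEN = 4096
multiple_of = 16                      # assumed: vae_scale_factor (8) * 2
ow, oh = 1920, 1080                   # hypothetical input image
req_w, req_h = ow * 2, oh * 2         # "原图的2倍" -> 3840 x 2160

max_area = MAX_IMAGE_SEQ_LEN * multiple_of * multiple_of   # 1,048,576 px
req_area, ratio = req_w * req_h, req_w / req_h             # 8,294,400 px, ~1.78

# The requested area exceeds the cap, so shrink it proportionally first.
target_area = req_area * (max_area / req_area) if req_area > max_area else req_area

# Closest 32-aligned size for that area and aspect ratio (mirroring the pipeline-style helper).
infer_w = round(math.sqrt(target_area * ratio) / 32) * 32   # 1376
infer_h = round(math.sqrt(target_area / ratio) / 32) * 32   # 768

# Floor-align to multiple_of, then walk down until (w/m)*(h/m) <= 4096.
infer_w = max(multiple_of, (infer_w // multiple_of) * multiple_of)
infer_h = max(multiple_of, (infer_h // multiple_of) * multiple_of)
while (infer_w // multiple_of) * (infer_h // multiple_of) > MAX_IMAGE_SEQ_LEN:
    if infer_w >= infer_h:
        infer_w -= multiple_of
    else:
        infer_h -= multiple_of

print(infer_w, infer_h)   # 1360 768 -> 85 * 48 = 4080 patches, under the 4096 cap

The final output is then resized back to the requested 3840x2160, which is exactly what the resize step added to infer does below.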
@@ -315,7 +303,7 @@ def infer(
     if images is not None:
         for item in images:
             try:
-                if isinstance(item,
+                if isinstance(item, (tuple, list)):
                     path_or_img = item[0]
                 else:
                     path_or_img = item
@@ -359,33 +347,52 @@
         seed = random.randint(0, MAX_SEED)
 
     generator = torch.Generator(device=device).manual_seed(seed)
-    negative_prompt = "worst quality, low quality, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
 
-
-
+    negative_prompt = (
+        "worst quality, low quality, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, "
+        "cropped, jpeg artifacts, signature, watermark, username, blurry"
+    )
+
+    # Key: compute this run's inference size (keeps the 2x option from blowing up) and set VAE_IMAGE_SIZE dynamically to avoid the "crop to the middle" behaviour
+    infer_w, infer_h, req_w, req_h = _pick_infer_size(pil_images[0], target_size_mode)
+    qwen_edit_module.VAE_IMAGE_SIZE = int(infer_w * infer_h)
+
+    print(f"[SizeMode={target_size_mode}] requested={req_w}x{req_h}, infer={infer_w}x{infer_h}, VAE_IMAGE_SIZE={qwen_edit_module.VAE_IMAGE_SIZE}")
 
     try:
         result_image = pipe(
             image=pil_images,
             prompt=prompt,
             negative_prompt=negative_prompt,
-            height=
-            width=
+            height=infer_h,
+            width=infer_w,
             num_inference_steps=steps,
             generator=generator,
             true_cfg_scale=guidance_scale,
         ).images[0]
 
+        # If the inference size differs from what the user asked for, resize here so the "original size / 2x" output matches (no cropping, only scaling).
+        # Note: if you would rather return the closest supported size instead, comment out this resize.
+        if (result_image.size[0], result_image.size[1]) != (req_w, req_h):
+            result_image = result_image.resize((req_w, req_h), Image.LANCZOS)
+
         return result_image, seed
 
-    except
+    except RuntimeError as e:
+        # Give a clearer message for the NVML_SUCCESS INTERNAL ASSERT FAILED errors that are common on HF Spaces (usually triggered by OOM/fragmentation)
+        msg = str(e)
+        if "NVML_SUCCESS" in msg or "CUDACachingAllocator" in msg or "out of memory" in msg.lower():
+            _maybe_cuda_cleanup()
+            raise gr.Error(
+                "推理失败:疑似显存不足/显存碎片化(常见于 VAE decode 阶段)。"
+                "建议:降低目标尺寸(或用“原图大小”而非 2 倍)、减少 steps,或避免频繁切换/加载大量 LoRA。"
+            )
         raise e
     finally:
-
-        torch.cuda.empty_cache()
+        _maybe_cuda_cleanup()
 
 @spaces.GPU
-def infer_example(images, prompt, lora_adapter
+def infer_example(images, prompt, lora_adapter):
     if not images:
         return None, 0
 
@@ -398,7 +405,7 @@ def infer_example(images, prompt, lora_adapter, size_mode):
         images=images_list,
         prompt=prompt,
         lora_adapter=lora_adapter,
-
+        target_size_mode="原图大小",  # examples default to the original size
         seed=0,
         randomize_seed=True,
         guidance_scale=1.0,
@@ -417,7 +424,7 @@ css = """
 with gr.Blocks() as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("# **Qwen-Image-Edit-2511-LoRAs-Fast**", elem_id="main-title")
-        gr.Markdown("Perform diverse image edits using specialized
+        gr.Markdown("Perform diverse image edits using specialized LoRA adapters. Upload one or more images.")
 
         with gr.Row(equal_height=True):
             with gr.Column():
@@ -448,14 +455,12 @@
                    value="Photo-to-Anime"
                )
 
-                #
-
-                    label="
-                    choices=[
-
-
-                    ],
-                    value="原图大小(不裁剪)",
+                # New: target image size option
+                target_size_mode = gr.Radio(
+                    label="目标图片大小",
+                    choices=["原图大小", "原图的2倍"],
+                    value="原图大小",
+                    info="如尺寸过大导致模型/显存不支持,会自动取最接近可推理尺寸;最终输出会 resize 回你选择的尺寸(不裁剪)。"
                )
 
                with gr.Accordion("Advanced Settings", open=False, visible=False):
@@ -466,22 +471,22 @@
 
        gr.Examples(
            examples=[
-                [["examples/B.jpg"], "Transform into anime.", "Photo-to-Anime"
-                [["examples/HRP.jpg"], "Transform into a hyper-realistic face portrait.", "Hyper-Realistic-Portrait"
-                [["examples/A.jpeg"], "Rotate the camera 45 degrees to the right.", "Multiple-Angles"
-                [["examples/U.jpg"], "Upscale this picture to 4K resolution.", "Upscaler"
-                [["examples/PP1.jpg"], "cinematic polaroid with soft grain subtle vignette gentle lighting white frame handwritten photographed by hf preserving realistic texture and details", "Polaroid-Photo"
-                [["examples/Z1.jpg"], "Front-right quarter view.", "Fal-Multiple-Angles"
-                [["examples/MT.jpg"], "Paint with manga tone.", "Manga-Tone"
-                [["examples/URP.jpg"], "ultra-realistic portrait.", "Ultra-Realistic-Portrait"
-                [["examples/MN.jpg"], "Transform into Midnight Noir Eyes Spotlight.", "Midnight-Noir-Eyes-Spotlight"
-                [["examples/ST1.jpg", "examples/ST2.jpg"], "Convert Image 1 to the style of Image 2.", "Style-Transfer"
-                [["examples/R1.jpg"], "Change the picture to realistic photograph.", "Anything2Real"
-                [["examples/UA.jpeg"], "Unblur and upscale.", "Unblur-Anything"
-                [["examples/L1.jpg", "examples/L2.jpg"], "Refer to the color tone, remove the original lighting from Image 1, and relight Image 1 based on the lighting and color tone of Image 2.", "Light-Migration"
-                [["examples/P1.jpg"], "Transform into anime (while preserving the background and remaining elements maintaining realism and original details.)", "Anime-V2"
+                [["examples/B.jpg"], "Transform into anime.", "Photo-to-Anime"],
+                [["examples/HRP.jpg"], "Transform into a hyper-realistic face portrait.", "Hyper-Realistic-Portrait"],
+                [["examples/A.jpeg"], "Rotate the camera 45 degrees to the right.", "Multiple-Angles"],
+                [["examples/U.jpg"], "Upscale this picture to 4K resolution.", "Upscaler"],
+                [["examples/PP1.jpg"], "cinematic polaroid with soft grain subtle vignette gentle lighting white frame handwritten photographed by hf preserving realistic texture and details", "Polaroid-Photo"],
+                [["examples/Z1.jpg"], "Front-right quarter view.", "Fal-Multiple-Angles"],
+                [["examples/MT.jpg"], "Paint with manga tone.", "Manga-Tone"],
+                [["examples/URP.jpg"], "ultra-realistic portrait.", "Ultra-Realistic-Portrait"],
+                [["examples/MN.jpg"], "Transform into Midnight Noir Eyes Spotlight.", "Midnight-Noir-Eyes-Spotlight"],
+                [["examples/ST1.jpg", "examples/ST2.jpg"], "Convert Image 1 to the style of Image 2.", "Style-Transfer"],
+                [["examples/R1.jpg"], "Change the picture to realistic photograph.", "Anything2Real"],
+                [["examples/UA.jpeg"], "Unblur and upscale.", "Unblur-Anything"],
+                [["examples/L1.jpg", "examples/L2.jpg"], "Refer to the color tone, remove the original lighting from Image 1, and relight Image 1 based on the lighting and color tone of Image 2.", "Light-Migration"],
+                [["examples/P1.jpg"], "Transform into anime (while preserving the background and remaining elements maintaining realism and original details.)", "Anime-V2"],
            ],
-            inputs=[images, prompt, lora_adapter
+            inputs=[images, prompt, lora_adapter],
            outputs=[output_image, seed],
            fn=infer_example,
            cache_examples=False,
@@ -492,7 +497,7 @@
 
    run_button.click(
        fn=infer,
-        inputs=[images, prompt, lora_adapter,
+        inputs=[images, prompt, lora_adapter, target_size_mode, seed, randomize_seed, guidance_scale, steps],
        outputs=[output_image, seed]
    )
 