sdfafdfsdf committed on
Commit
674481b
·
verified ·
1 Parent(s): 11b66b5

Upload app code and configuration files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ examples/13.jpg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: QIE-2511 Rapid-AIO LoRAs Fast (Experimental)
3
+ emoji: ⚡
4
+ colorFrom: red
5
+ colorTo: yellow
6
+ sdk: gradio
7
+ sdk_version: 6.2.0
8
+ app_file: app.py
9
+ pinned: true
10
+ license: apache-2.0
11
+ short_description: Demo of the Collection of Qwen Image Edit LoRAs
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,1380 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import sys
try:
    import spaces
except ImportError:
    # Local fallback when the Hugging Face Spaces ZeroGPU package is absent.
    # The real `spaces.GPU` can be used both bare (@spaces.GPU) and
    # parameterized (@spaces.GPU(duration=120)); the previous stub only
    # supported the bare form and crashed on keyword use.
    class spaces:
        @staticmethod
        def GPU(func=None, **kwargs):
            """No-op stand-in for spaces.GPU supporting both decorator forms."""
            if func is None:
                # Called as @spaces.GPU(...): return a pass-through decorator.
                return lambda f: f
            # Called as @spaces.GPU: return the function unchanged.
            return func
    # Register the stub so the later `import spaces` statement resolves to it.
    sys.modules["spaces"] = sys.modules.get("spaces", spaces)
9
+ import os
10
+ from camera_control_ui import CameraControl3D, build_camera_prompt, update_prompt_with_camera
11
+ import re
12
+ import gc
13
+ import traceback
14
+ import gradio as gr
15
+ import numpy as np
16
+ import spaces
17
+ import torch
18
+ import random
19
+ from PIL import Image, ImageDraw
20
+ from typing import Iterable, Optional
21
+
22
+ from transformers import (
23
+ AutoImageProcessor,
24
+ AutoModelForDepthEstimation,
25
+ )
26
+
27
+ from huggingface_hub import hf_hub_download
28
+ from safetensors.torch import load_file as safetensors_load_file
29
+
30
+ from gradio.themes import Soft
31
+ from gradio.themes.utils import colors, fonts, sizes
32
+
33
+ # ============================================================
34
+ # Theme
35
+ # ============================================================
36
+
37
+ colors.orange_red = colors.Color(
38
+ name="orange_red",
39
+ c50="#FFF0E5",
40
+ c100="#FFE0CC",
41
+ c200="#FFC299",
42
+ c300="#FFA366",
43
+ c400="#FF8533",
44
+ c500="#FF4500",
45
+ c600="#E63E00",
46
+ c700="#CC3700",
47
+ c800="#B33000",
48
+ c900="#992900",
49
+ c950="#802200",
50
+ )
51
+
52
+
53
class OrangeRedTheme(Soft):
    """Gradio Soft-theme variant using the custom `orange_red` palette.

    Only colors/fonts/sizes are customized; all behavior comes from Soft.
    The keyword-only constructor mirrors Soft's signature so callers can
    still override any hue/font at instantiation time.
    """

    def __init__(
        self,
        *,
        primary_hue: colors.Color | str = colors.gray,
        secondary_hue: colors.Color | str = colors.orange_red,
        neutral_hue: colors.Color | str = colors.slate,
        text_size: sizes.Size | str = sizes.text_lg,
        font: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("Outfit"),
            "Arial",
            "sans-serif",
        ),
        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
            fonts.GoogleFont("IBM Plex Mono"),
            "ui-monospace",
            "monospace",
        ),
    ):
        super().__init__(
            primary_hue=primary_hue,
            secondary_hue=secondary_hue,
            neutral_hue=neutral_hue,
            text_size=text_size,
            font=font,
            font_mono=font_mono,
        )
        # Override individual CSS variables on top of the Soft defaults.
        # `*token` strings are Gradio theme references, not literal CSS.
        super().set(
            background_fill_primary="*primary_50",
            background_fill_primary_dark="*primary_900",
            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
            button_primary_text_color="white",
            button_primary_text_color_hover="white",
            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
            button_secondary_text_color="black",
            button_secondary_text_color_hover="white",
            button_secondary_background_fill="linear-gradient(90deg, *primary_300, *primary_300)",
            button_secondary_background_fill_hover="linear-gradient(90deg, *primary_400, *primary_400)",
            button_secondary_background_fill_dark="linear-gradient(90deg, *primary_500, *primary_600)",
            button_secondary_background_fill_hover_dark="linear-gradient(90deg, *primary_500, *primary_500)",
            slider_color="*secondary_500",
            slider_color_dark="*secondary_600",
            block_title_text_weight="600",
            block_border_width="3px",
            block_shadow="*shadow_drop_lg",
            button_primary_shadow="*shadow_drop_lg",
            button_large_padding="11px",
            color_accent_soft="*primary_100",
            block_label_background_fill="*primary_200",
        )
+ )
107
+
108
+
109
+ orange_red_theme = OrangeRedTheme()
110
+
111
+ # ============================================================
112
+ # Device
113
+ # ============================================================
114
+
115
# Pick the main compute device once at import time; all later code reads
# this module-level `device`.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Startup diagnostics: echo the CUDA environment so Space logs show exactly
# what hardware/runtime the app detected.
print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
print("torch.__version__ =", torch.__version__)
print("torch.version.cuda =", torch.version.cuda)
print("cuda available:", torch.cuda.is_available())
print("cuda device count:", torch.cuda.device_count())
if torch.cuda.is_available():
    print("current device:", torch.cuda.current_device())
    print("device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
print("Using device:", device)
126
+
127
+ # ============================================================
128
+ # AIO version (Space variable)
129
+ # ============================================================
130
+
131
+ AIO_REPO_ID = "Pr0f3ssi0n4ln00b/Phr00t-Qwen-Rapid-AIO"
132
+ DEFAULT_AIO_VERSION = "v19"
133
+
134
+ _VER_RE = re.compile(r"^v\d+$")
135
+ _DIGITS_RE = re.compile(r"^\d+$")
136
+
137
+
138
+ def _normalize_version(raw: str) -> Optional[str]:
139
+ if raw is None:
140
+ return None
141
+ s = str(raw).strip()
142
+ if not s:
143
+ return None
144
+ if _VER_RE.fullmatch(s):
145
+ return s
146
+ # forgiving: allow "21" -> "v21"
147
+ if _DIGITS_RE.fullmatch(s):
148
+ return f"v{s}"
149
+ return None
150
+
151
+
152
+ _AIO_ENV_RAW = os.environ.get("AIO_VERSION", "")
153
+ _AIO_ENV_NORM = _normalize_version(_AIO_ENV_RAW)
154
+
155
+ AIO_VERSION = _AIO_ENV_NORM or DEFAULT_AIO_VERSION
156
+ AIO_VERSION_SOURCE = "env" if _AIO_ENV_NORM else "default(v19)"
157
+
158
+ print(f"AIO_VERSION (env raw) = {_AIO_ENV_RAW!r}")
159
+ print(f"AIO_VERSION (normalized) = {_AIO_ENV_NORM!r}")
160
+ print(f"Using AIO_VERSION = {AIO_VERSION} ({AIO_VERSION_SOURCE})")
161
+
162
+ # ============================================================
163
+ # Pipeline
164
+ # ============================================================
165
+
166
+ from diffusers import FlowMatchEulerDiscreteScheduler # noqa: F401
167
+ from qwenimage.pipeline_qwenimage_edit_plus import QwenImageEditPlusPipeline
168
+ from qwenimage.transformer_qwenimage import QwenImageTransformer2DModel
169
+ from qwenimage.qwen_fa3_processor import QwenDoubleStreamAttnProcessorFA3
170
+
171
+ dtype = torch.bfloat16
172
+
173
+
174
def _load_pipe_with_version(version: str) -> QwenImageEditPlusPipeline:
    """Build the edit pipeline with the Rapid-AIO transformer for `version`.

    Loads the base Qwen/Qwen-Image-Edit-2511 pipeline but swaps its
    transformer for the one stored under `<version>/transformer` inside
    AIO_REPO_ID. Raises whatever `from_pretrained` raises (caller handles
    fallback).
    """
    sub = f"{version}/transformer"
    print(f"📦 Loading AIO transformer: {AIO_REPO_ID} / {sub}")
    p = QwenImageEditPlusPipeline.from_pretrained(
        "Qwen/Qwen-Image-Edit-2511",
        transformer=QwenImageTransformer2DModel.from_pretrained(
            AIO_REPO_ID,
            subfolder=sub,
            torch_dtype=dtype,  # bf16 module-level dtype
            device_map="auto",
            low_cpu_mem_usage=True,
        ),
        torch_dtype=dtype,
    )
    # Keep components on CPU and move them to GPU only while they run,
    # trading speed for lower peak VRAM.
    p.enable_model_cpu_offload()
    return p
190
+
191
+
192
# Forgiving load: try env/default version, fallback to v19 if it fails
try:
    pipe = _load_pipe_with_version(AIO_VERSION)
except Exception as e:
    # Any failure (missing subfolder, corrupt weights, network error) falls
    # back to the known-good default so the Space still boots; the module
    # globals are rewritten so downstream logs reflect what actually loaded.
    print("❌ Failed to load requested AIO_VERSION. Falling back to v19.")
    print("---- exception ----")
    print(traceback.format_exc())
    print("-------------------")
    AIO_VERSION = DEFAULT_AIO_VERSION
    AIO_VERSION_SOURCE = "fallback_to_v19"
    pipe = _load_pipe_with_version(AIO_VERSION)
203
+
204
# Apply FA3 Optimization
# NOTE(review): FA3 is deliberately NOT applied here for stability. The
# previous code additionally printed "Flash Attention 3 Processor set
# successfully.", which was misleading because no attention processor is
# ever installed; that contradictory log line has been removed.
try:
    print("Skipping FA3 optimization for stability.")
except Exception as e:
    print(f"Warning: Could not set FA3 processor: {e}")
210
+
211
+ MAX_SEED = np.iinfo(np.int32).max
212
+
213
+ # ============================================================
214
+ # VAE tiling toggle (UI-controlled; OFF by default)
215
+ # ============================================================
216
+
217
def _apply_vae_tiling(enabled: bool):
    """
    Toggle VAE tiling on the global pipeline.

    This does NOT require a Space restart; it applies to the next pipe(...) call.
    Note: this is global process state, so concurrent users could flip it between runs.
    """
    try:
        if enabled:
            # Prefer the pipeline-level API; fall back to the VAE's own method.
            if hasattr(pipe, "enable_vae_tiling"):
                pipe.enable_vae_tiling()
                print("✅ VAE tiling ENABLED (per UI).")
            elif hasattr(pipe, "vae") and hasattr(pipe.vae, "enable_tiling"):
                pipe.vae.enable_tiling()
                print("✅ VAE tiling ENABLED via pipe.vae.enable_tiling() (per UI).")
            else:
                print("⚠️ No enable_vae_tiling()/vae.enable_tiling() found; cannot enable.")
        else:
            if hasattr(pipe, "disable_vae_tiling"):
                pipe.disable_vae_tiling()
                print("🛑 VAE tiling DISABLED (per UI).")
            elif hasattr(pipe, "vae") and hasattr(pipe.vae, "disable_tiling"):
                pipe.vae.disable_tiling()
                print("🛑 VAE tiling DISABLED via pipe.vae.disable_tiling() (per UI).")
            else:
                # If no disable method exists, we leave current state unchanged.
                print("⚠️ No disable_vae_tiling()/vae.disable_tiling() found; leaving current state unchanged.")
    except Exception as e:
        # Best effort: tiling is only a memory optimization, so never fail
        # the user's request over a toggle error.
        print(f"⚠️ VAE tiling toggle failed: {e}")
246
+
247
+ # ============================================================
248
+ # Derived conditioning (Transformers): Depth
249
+ # ============================================================
250
+ # Depth uses Depth Anything V2 Small (Transformers-compatible):
251
+ # https://huggingface.co/depth-anything/Depth-Anything-V2-Small-hf
252
+
253
+ DEPTH_MODEL_ID = "depth-anything/Depth-Anything-V2-Small-hf"
254
+
255
+ # Lazy cache keyed by device string ("cpu" / "cuda")
256
+ _DEPTH_CACHE = {}
257
+
258
+ def _derived_device(use_gpu: bool) -> torch.device:
259
+ return torch.device("cuda" if (use_gpu and torch.cuda.is_available()) else "cpu")
260
+
261
def _load_depth_models(dev: torch.device):
    """Return (image_processor, depth_model) for DEPTH_MODEL_ID on `dev`.

    Results are memoized in the module-level _DEPTH_CACHE keyed by the
    device string, so each device downloads/initializes at most once.
    """
    key = str(dev)
    if key in _DEPTH_CACHE:
        return _DEPTH_CACHE[key]
    proc = AutoImageProcessor.from_pretrained(DEPTH_MODEL_ID)
    model = AutoModelForDepthEstimation.from_pretrained(DEPTH_MODEL_ID).to(dev)
    # Inference-only: disable dropout/batch-norm training behavior.
    model.eval()
    _DEPTH_CACHE[key] = (proc, model)
    return _DEPTH_CACHE[key]
270
+
271
@torch.inference_mode()
def make_depth_map(img: Image.Image, *, use_gpu: bool) -> Image.Image:
    """Estimate a depth map for `img` and return it as an RGB PIL image.

    The model-resolution depth is resized back to the input size, min-max
    normalized to [0, 255], and converted to RGB so it can be used as a
    conditioning image by the edit pipeline.
    """
    dev = _derived_device(use_gpu)
    proc, model = _load_depth_models(dev)

    w, h = img.size
    inputs = proc(images=img.convert("RGB"), return_tensors="pt").to(dev)
    outputs = model(**inputs)
    predicted = outputs.predicted_depth  # [B, H, W]

    # Resize back to the original (h, w); note PIL size is (w, h) but
    # interpolate expects (h, w).
    depth = torch.nn.functional.interpolate(
        predicted.unsqueeze(1),
        size=(h, w),
        mode="bicubic",
        align_corners=False,
    ).squeeze(1)[0]

    # Min-max normalize to 8-bit; epsilon guards division by zero on a
    # perfectly flat depth map.
    depth = depth - depth.min()
    depth = depth / (depth.max() + 1e-8)
    depth = (depth * 255.0).clamp(0, 255).to(torch.uint8).cpu().numpy()
    return Image.fromarray(depth).convert("RGB")
292
+
293
+ # ============================================================
294
+ # LoRA adapters + presets
295
+ # ============================================================
296
+
297
+ NONE_LORA = "None"
298
+
299
+ ADAPTER_SPECS = {
300
+ "3D-Camera": {
301
+ "type": "single",
302
+ "repo": "fal/Qwen-Image-Edit-2511-Multiple-Angles-LoRA",
303
+ "weights": "qwen-image-edit-2511-multiple-angles-lora.safetensors",
304
+ "adapter_name": "angles",
305
+ "strength": 1.0,
306
+ },
307
+
308
+ "Qwen-lora-nsfw": {
309
+ "type": "single",
310
+ "repo": "wiikoo/Qwen-lora-nsfw",
311
+ "weights": "loras/qwen_image_edit_remove-clothing_v1.0.safetensors",
312
+ "adapter_name": "qwen-lora-nsfw",
313
+ "strength": 1.0,
314
+ },
315
+
316
+ "Consistance": {
317
+ "type": "single",
318
+ "repo": "Pr0f3ssi0n4ln00b/QIE_2511_Consistency_Lora",
319
+ "weights": "qe2511_consis_alpha_patched.safetensors",
320
+ "adapter_name": "Consistency",
321
+ "strength": 0.6,
322
+ },
323
+ "Semirealistic-photo-detailer": {
324
+ "type": "single",
325
+ "repo": "rzgar/Qwen-Image-Edit-semi-realistic-detailer",
326
+ "weights": "Qwen-Image-Edit-Anime-Semi-Realistic-Detailer-v1.safetensors",
327
+ "adapter_name": "semirealistic",
328
+ "strength": 1.0,
329
+ },
330
+ "AnyPose": {
331
+ "type": "package",
332
+ "requires_two_images": True,
333
+ "image2_label": "Upload Pose Reference (Image 2)",
334
+ "parts": [
335
+ {
336
+ "repo": "lilylilith/AnyPose",
337
+ "weights": "2511-AnyPose-base-000006250.safetensors",
338
+ "adapter_name": "anypose-base",
339
+ "strength": 0.7,
340
+ },
341
+ {
342
+ "repo": "lilylilith/AnyPose",
343
+ "weights": "2511-AnyPose-helper-00006000.safetensors",
344
+ "adapter_name": "anypose-helper",
345
+ "strength": 0.7,
346
+ },
347
+ ],
348
+ },
349
+ "Any2Real_2601": {
350
+ "type": "single",
351
+ "repo": "lrzjason/Anything2Real_2601",
352
+ "weights": "anything2real_2601_A_final_patched.safetensors",
353
+ "adapter_name": "photoreal",
354
+ "strength": 1.0,
355
+ },
356
+ "Hyperrealistic-Portrait": {
357
+ "type": "single",
358
+ "repo": "prithivMLmods/Qwen-Image-Edit-2511-Hyper-Realistic-Portrait",
359
+ "weights": "HRP_20.safetensors",
360
+ "adapter_name": "HRPortrait",
361
+ "strength": 1.0,
362
+ },
363
+ "Ultrarealistic-Portrait": {
364
+ "type": "single",
365
+ "repo": "prithivMLmods/Qwen-Image-Edit-2511-Ultra-Realistic-Portrait",
366
+ "weights": "URP_20.safetensors",
367
+ "adapter_name": "URPortrait",
368
+ "strength": 1.0,
369
+ },
370
+ "BFS-Best-FaceSwap": {
371
+ "type": "single",
372
+ "requires_two_images": True,
373
+ "image2_label": "Upload Head/Face Donor (Image 2)",
374
+ "repo": "Alissonerdx/BFS-Best-Face-Swap",
375
+ "weights": "bfs_head_v5_2511_original.safetensors",
376
+ "adapter_name": "BFS-Best-Faceswap",
377
+ "strength": 1.0,
378
+ "needs_alpha_fix": True, # <-- fixes KeyError 'img_in.alpha'
379
+ },
380
+ "BFS-Best-FaceSwap-merge": {
381
+ "type": "single",
382
+ "requires_two_images": True,
383
+ "image2_label": "Upload Head/Face Donor (Image 2)",
384
+ "repo": "Alissonerdx/BFS-Best-Face-Swap",
385
+ "weights": "bfs_head_v5_2511_merged_version_rank_32_fp32.safetensors",
386
+ "adapter_name": "BFS-Best-Faceswap-merge",
387
+ "strength": 1.1,
388
+ "needs_alpha_fix": True, # <-- fixes KeyError 'img_in.alpha'
389
+ },
390
+ "F2P": {
391
+ "type": "single",
392
+ "repo": "DiffSynth-Studio/Qwen-Image-Edit-F2P",
393
+ "weights": "edit_0928_lora_step40000.safetensors",
394
+ "adapter_name": "F2P",
395
+ "strength": 1.0,
396
+ },
397
+ "Multiple-Angles": {
398
+ "type": "single",
399
+ "repo": "dx8152/Qwen-Edit-2509-Multiple-angles",
400
+ "weights": "镜头转换.safetensors",
401
+ "adapter_name": "multiple-angles",
402
+ "strength": 1.0,
403
+ },
404
+ "Light-Restoration": {
405
+ "type": "single",
406
+ "repo": "dx8152/Qwen-Image-Edit-2509-Light_restoration",
407
+ "weights": "移除光影.safetensors",
408
+ "adapter_name": "light-restoration",
409
+ "strength": 1.0,
410
+ },
411
+ "Relight": {
412
+ "type": "single",
413
+ "repo": "dx8152/Qwen-Image-Edit-2509-Relight",
414
+ "weights": "Qwen-Edit-Relight.safetensors",
415
+ "adapter_name": "relight",
416
+ "strength": 1.0,
417
+ },
418
+ "Multi-Angle-Lighting": {
419
+ "type": "single",
420
+ "repo": "dx8152/Qwen-Edit-2509-Multi-Angle-Lighting",
421
+ "weights": "多角度灯光-251116.safetensors",
422
+ "adapter_name": "multi-angle-lighting",
423
+ "strength": 1.0,
424
+ },
425
+ "Edit-Skin": {
426
+ "type": "single",
427
+ "repo": "tlennon-ie/qwen-edit-skin",
428
+ "weights": "qwen-edit-skin_1.1_000002750.safetensors",
429
+ "adapter_name": "edit-skin",
430
+ "strength": 1.0,
431
+ },
432
+ "Next-Scene": {
433
+ "type": "single",
434
+ "repo": "lovis93/next-scene-qwen-image-lora-2509",
435
+ "weights": "next-scene_lora-v2-3000.safetensors",
436
+ "adapter_name": "next-scene",
437
+ "strength": 1.0,
438
+ },
439
+ "Flat-Log": {
440
+ "type": "single",
441
+ "repo": "tlennon-ie/QwenEdit2509-FlatLogColor",
442
+ "weights": "QwenEdit2509-FlatLogColor.safetensors",
443
+ "adapter_name": "flat-log",
444
+ "strength": 1.0,
445
+ },
446
+ "Upscale-Image": {
447
+ "type": "single",
448
+ "repo": "vafipas663/Qwen-Edit-2509-Upscale-LoRA",
449
+ "weights": "qwen-edit-enhance_64-v3_000001000.safetensors",
450
+ "adapter_name": "upscale-image",
451
+ "strength": 1.0,
452
+ },
453
+ "Upscale2K": {
454
+ "type": "single",
455
+ "repo": "valiantcat/Qwen-Image-Edit-2509-Upscale2K",
456
+ "weights": "qwen_image_edit_2509_upscale.safetensors",
457
+ "adapter_name": "upscale-2k",
458
+ "strength": 1.0,
459
+ "target_long_edge": 2048,
460
+ },
461
+ }
462
+
463
+ LORA_PRESET_PROMPTS = {
464
+ "Any2Real_2601": "change the picture 1 to realistic photograph",
465
+ "Semirealistic-photo-detailer": "transform the image to semi-realistic image",
466
+ "AnyPose": "Make the person in image 1 do the exact same pose of the person in image 2. Changing the style and background of the image of the person in image 1 is undesirable, so don't do it. The new pose should be pixel accurate to the pose we are trying to copy. The position of the arms and head and legs should be the same as the pose we are trying to copy. Change the field of view and angle to match exactly image 2. Head tilt and eye gaze pose should match the person in image 2.",
467
+ "Hyperrealistic-Portrait": "Transform the image into an ultra-realistic photorealistic portrait with strict identity preservation, facing straight to the camera. Enhance pore-level skin textures, realistic moisture effects, and natural wet hair clumping against the skin. Apply cool-toned soft-box lighting with subtle highlights and shadows, maintain realistic green-hazel eye catchlights without synthetic gloss, and preserve soft natural lip texture. Use shallow depth of field with a clean bokeh background, an 85mm macro photographic look, and raw photo grading without retouching to maintain realism and original details.",
468
+ "Ultrarealistic-Portrait": "Transform the image into an ultra-realistic glamour portrait while strictly preserving the subject’s identity. Apply a close-up composition with a slight head tilt and a hand near the face, enhance cinematic directional lighting with dramatic fashion-style highlights, and refine makeup details including glowing skin, glossy lips, luminous highlighter, and defined eyes. Increase skin realism with detailed epidermal textures such as micropores, microhairs, subtle oil sheen, natural highlights, soft wrinkles, and subsurface scattering. Maintain a luxury fashion-magazine look in a 9:16 aspect ratio, preserving realism, facial structure, and original details without over-smoothing or retouching.",
469
+ "Upscale2K": "Upscale this picture to 4K resolution.",
470
+ "BFS-Best-FaceSwap": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
471
+ "BFS-Best-FaceSwap-merge": "head_swap: start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
472
+ }
473
+
474
+ # Track what is currently loaded in memory (adapter_name values)
475
+ LOADED_ADAPTERS = set()
476
+
477
+ # ============================================================
478
+ # Helpers: resolution
479
+ # ============================================================
480
+
481
+ # We prefer *area-based* sizing (≈ megapixels) over long-edge sizing.
482
+ # This aligns better with Qwen-Image-Edit's internal assumptions and reduces FOV drift.
483
+
484
+ def _round_to_multiple(x: int, m: int) -> int:
485
+ return max(m, (int(x) // m) * m)
486
+
487
def compute_canvas_dimensions_from_area(
    image: Image.Image,
    target_area: int,
    multiple_of: int,
) -> tuple[int, int]:
    """Compute (width, height) that matches image aspect ratio and approximates target_area.

    The result is floored to be divisible by multiple_of (typically vae_scale_factor*2).
    """
    w, h = image.size
    # Guard degenerate zero-height inputs with a square aspect.
    aspect = w / h if h else 1.0

    # Use the pipeline's own area->(w,h) helper for consistency.
    from qwenimage.pipeline_qwenimage_edit_plus import calculate_dimensions

    width, height = calculate_dimensions(int(target_area), float(aspect))
    # Snap both edges down to the required multiple (never below it).
    width = _round_to_multiple(int(width), int(multiple_of))
    height = _round_to_multiple(int(height), int(multiple_of))
    return width, height
506
+
507
def get_target_area_for_lora(
    image: Image.Image,
    lora_adapter: str,
    user_target_megapixels: float,
) -> int:
    """Return target pixel area for the canvas.

    Priority:
      1) Adapter spec: target_area (pixels) or target_megapixels
      2) Adapter spec: target_long_edge (legacy) -> converted to area using image aspect
      3) User slider target megapixels
    """
    spec = ADAPTER_SPECS.get(lora_adapter, {})

    # Each spec lookup is best-effort: a malformed value simply falls
    # through to the next priority rather than failing the request.
    if "target_area" in spec:
        try:
            return int(spec["target_area"])
        except Exception:
            pass

    if "target_megapixels" in spec:
        try:
            mp = float(spec["target_megapixels"])
            return int(mp * 1024 * 1024)
        except Exception:
            pass

    # Legacy support (e.g. Upscale2K)
    if "target_long_edge" in spec:
        try:
            long_edge = int(spec["target_long_edge"])
            w, h = image.size
            # Scale the short edge to preserve the input aspect ratio.
            if w >= h:
                new_w = long_edge
                new_h = int(round(long_edge * (h / w)))
            else:
                new_h = long_edge
                new_w = int(round(long_edge * (w / h)))
            return int(new_w * new_h)
        except Exception:
            pass

    # User default
    try:
        mp = float(user_target_megapixels)
    except Exception:
        mp = 1.0

    # Treat 0 MP as "match input area"
    if mp <= 0:
        w, h = image.size
        return int(w * h)

    return int(mp * 1024 * 1024)
561
+
562
+ # ============================================================
563
+ # Helpers: multi-input routing + gallery normalization
564
+ # ============================================================
565
+
566
+
567
def lora_requires_two_images(lora_adapter: str) -> bool:
    """True when the selected adapter needs a second reference image."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    return bool(spec.get("requires_two_images", False))


def image2_label_for_lora(lora_adapter: str) -> str:
    """Label for the second upload box, with an adapter-specific override."""
    spec = ADAPTER_SPECS.get(lora_adapter, {})
    return str(spec.get("image2_label", "Upload Reference (Image 2)"))
573
+
574
+
575
def _to_pil_rgb(x) -> Optional[Image.Image]:
    """
    Normalize a gallery item to a PIL RGB image.

    Handles PIL images, numpy arrays, and the (image, caption) tuples that
    gr.Gallery commonly yields; returns None when nothing usable is found.
    """
    # Unwrap gr.Gallery's (image, caption) tuple form first.
    if isinstance(x, tuple):
        x = x[0] if x else None

    if x is None:
        return None

    if isinstance(x, Image.Image):
        return x.convert("RGB")

    if isinstance(x, np.ndarray):
        return Image.fromarray(x).convert("RGB")

    # Last resort: let numpy try to coerce whatever this is.
    try:
        return Image.fromarray(np.array(x)).convert("RGB")
    except Exception:
        return None
600
+
601
+
602
def build_labeled_images(
    img1: Image.Image,
    img2: Optional[Image.Image],
    extra_imgs: Optional[list[Image.Image]],
) -> dict[str, Image.Image]:
    """
    Assign sequential labels image_1, image_2, ... to the uploaded images.

    img1 is always image_1; img2 takes the next slot only when present, and
    any non-None extras continue numbering after it. Insertion order is the
    exact order the pipeline receives the images.
    """
    labeled: dict[str, Image.Image] = {"image_1": img1}
    slot = 2

    if img2 is not None:
        labeled[f"image_{slot}"] = img2
        slot += 1

    for extra in extra_imgs or []:
        if extra is None:
            continue
        labeled[f"image_{slot}"] = extra
        slot += 1

    return labeled
632
+
633
+
634
+ # ============================================================
635
+ # Helpers: BFS alpha key fix
636
+ # ============================================================
637
+
638
+
639
+ def _inject_missing_alpha_keys(state_dict: dict) -> dict:
640
+ """
641
+ Diffusers' Qwen LoRA converter expects '<module>.alpha' keys.
642
+ BFS safetensors omits them. We inject alpha = rank (neutral scaling).
643
+
644
+ IMPORTANT: diffusers may strip 'diffusion_model.' before lookup, so we
645
+ inject BOTH:
646
+ - diffusion_model.xxx.alpha
647
+ - xxx.alpha
648
+ """
649
+ bases = {}
650
+
651
+ for k, v in state_dict.items():
652
+ if not isinstance(v, torch.Tensor):
653
+ continue
654
+ if k.endswith(".lora_down.weight") and v.ndim >= 1:
655
+ base = k[: -len(".lora_down.weight")]
656
+ rank = int(v.shape[0])
657
+ bases[base] = rank
658
+
659
+ for base, rank in bases.items():
660
+ alpha_tensor = torch.tensor(float(rank), dtype=torch.float32)
661
+
662
+ full_alpha = f"{base}.alpha"
663
+ if full_alpha not in state_dict:
664
+ state_dict[full_alpha] = alpha_tensor
665
+
666
+ if base.startswith("diffusion_model."):
667
+ stripped_base = base[len("diffusion_model.") :]
668
+ stripped_alpha = f"{stripped_base}.alpha"
669
+ if stripped_alpha not in state_dict:
670
+ state_dict[stripped_alpha] = alpha_tensor
671
+
672
+ return state_dict
673
+
674
+
675
+ def _filter_to_diffusers_lora_keys(state_dict: dict) -> tuple[dict, dict]:
676
+ """Return (filtered_state_dict, stats).
677
+
678
+ Some ComfyUI/Qwen safetensors (especially "merged" variants) include non-LoRA
679
+ delta/patch keys like `*.diff` and `*.diff_b` alongside real LoRA tensors.
680
+ Diffusers' internal Qwen LoRA converter is strict: any leftover keys cause an
681
+ error (`state_dict should be empty...`).
682
+
683
+ This helper keeps only the keys Diffusers can consume as a LoRA:
684
+ - `*.lora_up.weight`
685
+ - `*.lora_down.weight`
686
+ - (rare) `*.lora_mid.weight`
687
+ - alpha keys: `*.alpha` (or `*.lora_alpha` which we normalize to `*.alpha`)
688
+
689
+ It also drops known patch keys (`*.diff`, `*.diff_b`) and everything else.
690
+ """
691
+
692
+ keep_suffixes = (
693
+ ".lora_up.weight",
694
+ ".lora_down.weight",
695
+ ".lora_mid.weight",
696
+ ".alpha",
697
+ ".lora_alpha",
698
+ )
699
+
700
+ dropped_patch = 0
701
+ dropped_other = 0
702
+ kept = 0
703
+ normalized_alpha = 0
704
+
705
+ out: dict[str, torch.Tensor] = {}
706
+ for k, v in state_dict.items():
707
+ if not isinstance(v, torch.Tensor):
708
+ # Ignore non-tensor entries if any.
709
+ dropped_other += 1
710
+ continue
711
+
712
+ # Drop ComfyUI "delta" keys that Diffusers' LoRA loader will never consume.
713
+ if k.endswith(".diff") or k.endswith(".diff_b"):
714
+ dropped_patch += 1
715
+ continue
716
+
717
+ if not k.endswith(keep_suffixes):
718
+ dropped_other += 1
719
+ continue
720
+
721
+ if k.endswith(".lora_alpha"):
722
+ # Normalize common alt name to what Diffusers expects.
723
+ base = k[: -len(".lora_alpha")]
724
+ k2 = f"{base}.alpha"
725
+ out[k2] = v.float() if v.dtype != torch.float32 else v
726
+ normalized_alpha += 1
727
+ kept += 1
728
+ continue
729
+
730
+ out[k] = v
731
+ kept += 1
732
+
733
+ stats = {
734
+ "kept": kept,
735
+ "dropped_patch": dropped_patch,
736
+ "dropped_other": dropped_other,
737
+ "normalized_alpha": normalized_alpha,
738
+ }
739
+ return out, stats
740
+
741
+
742
+ def _duplicate_stripped_prefix_keys(state_dict: dict, prefix: str = "diffusion_model.") -> dict:
743
+ """Ensure both prefixed and unprefixed variants exist for LoRA-related keys.
744
+
745
+ Diffusers' Qwen LoRA conversion may strip `diffusion_model.` when looking up
746
+ modules. Some exports only include prefixed keys. To be maximally compatible,
747
+ we duplicate LoRA keys (and alpha) in stripped form when missing.
748
+ """
749
+
750
+ out = dict(state_dict)
751
+ for k, v in list(state_dict.items()):
752
+ if not k.startswith(prefix):
753
+ continue
754
+ stripped = k[len(prefix) :]
755
+ if stripped not in out:
756
+ out[stripped] = v
757
+ return out
758
+
759
+
760
def _load_lora_weights_with_fallback(repo: str, weight_name: str, adapter_name: str, needs_alpha_fix: bool = False):
    """
    Load a LoRA into the global `pipe`, with a cleanup fallback for non-standard exports.

    Normal path: pipe.load_lora_weights(repo, weight_name=..., adapter_name=...)
    BFS fallback: download safetensors, inject missing alpha keys, then load from dict.

    Args:
        repo: Hugging Face repo id hosting the LoRA.
        weight_name: safetensors filename inside the repo.
        adapter_name: adapter name to register on the pipeline.
        needs_alpha_fix: opt-in flag for the fallback path; when False the
            first-attempt error is re-raised untouched.

    Raises:
        KeyError / ValueError: propagated from the first attempt when
        `needs_alpha_fix` is False (see inline notes for their typical causes).
    """
    try:
        pipe.load_lora_weights(repo, weight_name=weight_name, adapter_name=adapter_name)
        return
    except (KeyError, ValueError) as e:
        # KeyError: missing required alpha keys (common in BFS)
        # ValueError: Diffusers Qwen converter found leftover keys (e.g. .diff/.diff_b)
        if not needs_alpha_fix:
            raise

        print(
            "⚠️ LoRA load failed (will try safe dict fallback). "
            f"Adapter={adapter_name!r} file={weight_name!r} error={type(e).__name__}: {e}"
        )

        # Fetch the raw checkpoint and rebuild a Diffusers-compatible dict.
        local_path = hf_hub_download(repo_id=repo, filename=weight_name)
        sd = safetensors_load_file(local_path)

        # 1) Inject required `<module>.alpha` keys (neutral scaling alpha=rank).
        sd = _inject_missing_alpha_keys(sd)

        # 2) Keep only LoRA + alpha keys; drop ComfyUI patch/delta keys.
        sd, stats = _filter_to_diffusers_lora_keys(sd)

        # 3) Duplicate stripped keys (remove `diffusion_model.`) for compatibility.
        sd = _duplicate_stripped_prefix_keys(sd)

        print(
            "🧹 LoRA dict cleanup stats: "
            f"kept={stats['kept']} dropped_patch={stats['dropped_patch']} "
            f"dropped_other={stats['dropped_other']} normalized_alpha={stats['normalized_alpha']}"
        )

        # Load from the in-memory dict instead of the hub file.
        pipe.load_lora_weights(sd, adapter_name=adapter_name)
        return
799
+
800
+
801
+ # ============================================================
802
+ # LoRA loader: single/package + strengths
803
+ # ============================================================
804
+
805
+
806
def _ensure_loaded_and_get_active_adapters(selected_lora: str):
    """Resolve a UI selection to loaded pipeline adapters.

    Looks up `selected_lora` in the module-level ADAPTER_SPECS, lazily loads
    any adapter file not yet in LOADED_ADAPTERS (via
    `_load_lora_weights_with_fallback`), and returns the adapter names with
    their configured strengths, ready for `pipe.set_adapters(...)`.

    Supports two spec shapes:
      - "package": several LoRA parts activated together, each with its own
        repo/file/strength.
      - single: one repo/file/strength.

    Returns:
        (adapter_names, adapter_weights): parallel lists.

    Raises:
        gr.Error: unknown selection, empty package, or a failed load.
    """
    spec = ADAPTER_SPECS.get(selected_lora)
    if not spec:
        raise gr.Error(f"Configuration not found for: {selected_lora}")

    adapter_names = []
    adapter_weights = []

    if spec.get("type") == "package":
        parts = spec.get("parts", [])
        if not parts:
            raise gr.Error(f"Package spec has no parts: {selected_lora}")

        for part in parts:
            repo = part["repo"]
            weights = part["weights"]
            adapter_name = part["adapter_name"]
            strength = float(part.get("strength", 1.0))
            needs_alpha_fix = bool(part.get("needs_alpha_fix", False))

            if adapter_name not in LOADED_ADAPTERS:
                print(f"--- Downloading and Loading Adapter Part: {selected_lora} / {adapter_name} ---")
                try:
                    _load_lora_weights_with_fallback(
                        repo=repo,
                        weight_name=weights,
                        adapter_name=adapter_name,
                        needs_alpha_fix=needs_alpha_fix,
                    )
                    # Mark as loaded only after a successful load.
                    LOADED_ADAPTERS.add(adapter_name)
                except Exception as e:
                    raise gr.Error(f"Failed to load adapter part {selected_lora}/{adapter_name}: {e}")
            else:
                print(f"--- Adapter part already loaded: {selected_lora} / {adapter_name} ---")

            adapter_names.append(adapter_name)
            adapter_weights.append(strength)

    else:
        # Single-adapter spec.
        repo = spec["repo"]
        weights = spec["weights"]
        adapter_name = spec["adapter_name"]
        strength = float(spec.get("strength", 1.0))
        needs_alpha_fix = bool(spec.get("needs_alpha_fix", False))

        if adapter_name not in LOADED_ADAPTERS:
            print(f"--- Downloading and Loading Adapter: {selected_lora} ---")
            try:
                _load_lora_weights_with_fallback(
                    repo=repo,
                    weight_name=weights,
                    adapter_name=adapter_name,
                    needs_alpha_fix=needs_alpha_fix,
                )
                LOADED_ADAPTERS.add(adapter_name)
            except Exception as e:
                raise gr.Error(f"Failed to load adapter {selected_lora}: {e}")
        else:
            print(f"--- Adapter {selected_lora} is already loaded. ---")

        adapter_names = [adapter_name]
        adapter_weights = [strength]

    return adapter_names, adapter_weights
870
+
871
+
872
+ # ============================================================
873
+ # UI handlers
874
+ # ============================================================
875
+
876
+
877
+
878
def on_lora_change_ui(selected_lora, current_prompt, extras_condition_only):
    """Update dependent UI widgets when the LoRA dropdown changes.

    Returns gr.update() objects for, in order: the prompt textbox, the
    Image 2 uploader (visibility/label), the extras-conditioning checkbox,
    and the 3D-camera container (visible only for the "3D-Camera" LoRA).
    """
    prompt_val = current_prompt
    if selected_lora != NONE_LORA:
        preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
        if preset:
            prompt_val = preset
        else:
            # A LoRA is active but has no preset: clear the prompt so the
            # previous LoRA's preset text does not leak into this run.
            prompt_val = ""

    prompt_update = gr.update(value=prompt_val)
    camera_update = gr.update(visible=(selected_lora == "3D-Camera"))

    # Image2 visibility/label: two-image LoRAs show the uploader with a
    # LoRA-specific label; otherwise hide it and drop any stale image.
    if lora_requires_two_images(selected_lora):
        img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))
    else:
        img2_update = gr.update(visible=False, value=None, label='Upload Reference (Image 2)')

    # Extra references routing default: face-swap / pose LoRAs force
    # conditioning-only extras; everything else keeps the user's choice.
    if selected_lora in ('BFS-Best-FaceSwap', 'BFS-Best-FaceSwap-merge', 'AnyPose'):
        extras_update = gr.update(value=True)
    else:
        extras_update = gr.update(value=extras_condition_only)

    return prompt_update, img2_update, extras_update, camera_update
903
+ # ============================================================
904
+ # UI helpers: output routing + derived conditioning
905
+
906
+ def _append_to_gallery(existing_gallery, new_image):
907
+ if existing_gallery is None:
908
+ return [new_image]
909
+ if not isinstance(existing_gallery, list):
910
+ existing_gallery = [existing_gallery]
911
+ existing_gallery.append(new_image)
912
+ return existing_gallery
913
+
914
+ # ============================================================
915
+
916
def set_output_as_image1(last):
    """Route the most recent generated output into the Image 1 input slot."""
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
920
+
921
+
922
def set_output_as_image2(last):
    """Route the most recent generated output into the Image 2 reference slot."""
    if last is not None:
        return gr.update(value=last)
    raise gr.Error("No output available yet.")
926
+
927
+
928
def set_output_as_extra(last, existing_extra):
    """Append the most recent generated output to the extra-references gallery."""
    if last is not None:
        return _append_to_gallery(existing_extra, last)
    raise gr.Error("No output available yet.")
932
+
933
+
934
@spaces.GPU
def add_derived_ref(img1, existing_extra, derived_type, derived_use_gpu):
    """Derive a conditioning image (e.g. depth map) from Image 1 and append it
    to the extras gallery.

    Returns two gr.update() objects: the updated extras gallery value and the
    derived-image preview (hidden when derived_type is "None").

    Raises:
        gr.Error: when Image 1 is missing or the derived type is unknown.
    """
    if img1 is None:
        raise gr.Error("Please upload Image 1 first.")

    if derived_type == "None":
        # Nothing to derive: keep the gallery as-is and hide the preview.
        return gr.update(value=existing_extra), gr.update(visible=False, value=None)

    base = img1.convert("RGB")

    if derived_type == "Depth (Depth Anything V2 Small)":
        derived = make_depth_map(base, use_gpu=bool(derived_use_gpu))
    else:
        raise gr.Error(f"Unknown derived type: {derived_type}")

    new_gallery = _append_to_gallery(existing_extra, derived)
    return gr.update(value=new_gallery), gr.update(visible=True, value=derived)
951
+
952
+
953
+ # ============================================================
954
+ # Inference
955
+ # ============================================================
956
+
957
+
958
@spaces.GPU
def infer(
    input_image_1,
    input_image_2,
    input_images_extra,  # gallery multi-image box
    prompt,
    lora_adapter,
    seed,
    randomize_seed,
    guidance_scale,
    steps,
    target_megapixels,
    extras_condition_only,
    pad_to_canvas,
    vae_tiling,  # VAE tiling toggle
    resolution_multiple,
    vae_ref_megapixels,
    decoder_vae,
    keep_decoder_2x,
    progress=gr.Progress(track_tqdm=True),
):
    """Run one Qwen-Image-Edit generation with the selected LoRA.

    Flow: activate/deactivate adapters -> normalize input images -> compute a
    canvas size from Image 1's area -> call the pipeline -> return the result.

    Returns:
        (result_image, seed, result_image): the third element feeds the
        `last_output` state so the output-routing buttons can reuse it.

    Raises:
        gr.Error: missing Image 1, or a two-image LoRA without Image 2.
    """
    # Free as much memory as possible before starting (Spaces GPU workers
    # are shared across requests).
    gc.collect()
    if torch.cuda.is_available():
        torch.cuda.empty_cache()

    if input_image_1 is None:
        raise gr.Error("Please upload Image 1.")

    # Handle "None": run the base model with all adapters deactivated.
    if lora_adapter == NONE_LORA:
        try:
            pipe.set_adapters([], adapter_weights=[])
        except Exception:
            # Some diffusers versions reject an empty adapter list; fall back
            # to zeroing out every loaded adapter instead.
            if LOADED_ADAPTERS:
                pipe.set_adapters(list(LOADED_ADAPTERS), adapter_weights=[0.0] * len(LOADED_ADAPTERS))
    else:
        adapter_names, adapter_weights = _ensure_loaded_and_get_active_adapters(lora_adapter)
        pipe.set_adapters(adapter_names, adapter_weights=adapter_weights)

    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    generator = torch.Generator(device=device).manual_seed(seed)
    negative_prompt = (
        "worst quality, low quality, bad anatomy, bad hands, text, error, missing fingers, "
        "extra digit, fewer digits, cropped, jpeg artifacts, signature, watermark, username, blurry"
    )

    img1 = input_image_1.convert("RGB")
    img2 = input_image_2.convert("RGB") if input_image_2 is not None else None

    # Normalize extra images (Gallery) to PIL RGB (handles tuples from Gallery)
    extra_imgs: list[Image.Image] = []
    if input_images_extra:
        for item in input_images_extra:
            pil = _to_pil_rgb(item)
            if pil is not None:
                extra_imgs.append(pil)

    # Enforce existing 2-image LoRA behavior (image_1 + image_2 required)
    if lora_requires_two_images(lora_adapter) and img2 is None:
        raise gr.Error("This LoRA needs two images. Please upload Image 2 as well.")

    # Label images as image_1, image_2, image_3...
    labeled = build_labeled_images(img1, img2, extra_imgs)

    # Pass to pipeline in labeled order. Keep single-image call when only one is present.
    pipe_images = list(labeled.values())
    if len(pipe_images) == 1:
        pipe_images = pipe_images[0]

    # Resolution derived from Image 1 (base/body/target)
    # Use target *area* (≈ megapixels) rather than long-edge sizing to reduce FOV drift.
    target_area = get_target_area_for_lora(img1, lora_adapter, float(target_megapixels))
    width, height = compute_canvas_dimensions_from_area(
        img1,
        target_area=target_area,
        multiple_of=int(resolution_multiple),
    )

    # Decide which images participate in the VAE latent stream.
    # If enabled, extra references beyond (Img_1, Img_2) become conditioning-only.
    vae_image_indices = None
    if extras_condition_only:
        if isinstance(pipe_images, list) and len(pipe_images) > 2:
            vae_image_indices = [0, 1] if len(pipe_images) >= 2 else [0]

    try:
        print(
            "[DEBUG][infer] submitting request | "
            f"lora_adapter={lora_adapter!r} seed={seed} prompt={prompt!r}"
        )
        print(f"[DEBUG][infer] canvas={width}x{height} (~{(width*height)/1_048_576:.3f} MP) vae_tiling={bool(vae_tiling)}")

        # ✅ Apply UI toggle per-request (OFF by default)
        # Lattice multiple passed to pipeline too (anti-drift / valid size grid)
        res_mult = int(resolution_multiple) if resolution_multiple is not None else int(pipe.vae_scale_factor * 2)

        # Optional: override VAE sizing for *extra* references (beyond Image 1 / Image 2)
        # Interpreted as megapixels; 0 disables override (uses canvas).
        try:
            mp_ref = float(vae_ref_megapixels)
        except Exception:
            mp_ref = 0.0

        vae_ref_area = int(mp_ref * 1024 * 1024) if mp_ref and mp_ref > 0 else None

        # Extras start index depends on whether Image 2 exists
        base_ref_count = 2 if img2 is not None else 1

        _apply_vae_tiling(bool(vae_tiling))

        result = pipe(
            image=pipe_images,
            prompt=prompt,
            negative_prompt=negative_prompt,
            height=height,
            width=width,
            num_inference_steps=steps,
            generator=generator,
            true_cfg_scale=guidance_scale,
            vae_image_indices=vae_image_indices,
            pad_to_canvas=bool(pad_to_canvas),
            resolution_multiple=res_mult,
            vae_ref_area=vae_ref_area,
            vae_ref_start_index=base_ref_count,
            decoder_vae=str(decoder_vae).lower(),
            keep_decoder_2x=bool(keep_decoder_2x),
        ).images[0]
        return result, seed, result
    finally:
        # Always release GPU memory, even on failure.
        gc.collect()
        if torch.cuda.is_available():
            torch.cuda.empty_cache()
1092
+
1093
+
1094
@spaces.GPU
def infer_example(input_image, prompt, lora_adapter):
    """Run `infer` with fixed example defaults (used by the examples widget).

    Args:
        input_image: PIL image or None (None short-circuits to empty outputs).
        prompt: edit prompt for the example.
        lora_adapter: LoRA selection name.

    Returns:
        (result_image, seed, last_output), or (None, 0, None) without an image.
    """
    if input_image is None:
        return None, 0, None
    input_pil = input_image.convert("RGB")
    guidance_scale = 1.0
    steps = 4
    # Examples don't supply Image 2 or extra images; and example list doesn't include AnyPose/BFS.
    # Keep VAE tiling OFF in examples (matches default).
    # Bug fix: `infer` takes four more required parameters than were passed here
    # (resolution_multiple, vae_ref_megapixels, decoder_vae, keep_decoder_2x),
    # so this call raised TypeError. Supply the same defaults the UI uses.
    result, seed, last = infer(
        input_pil,
        None,     # input_image_2
        None,     # input_images_extra
        prompt,
        lora_adapter,
        0,        # seed
        True,     # randomize_seed
        guidance_scale,
        steps,
        1.0,      # target_megapixels
        True,     # extras_condition_only
        True,     # pad_to_canvas
        False,    # vae_tiling
        32,       # resolution_multiple (UI default lattice)
        0.0,      # vae_ref_megapixels (0 = use canvas)
        "qwen",   # decoder_vae (UI default)
        False,    # keep_decoder_2x
    )
    return result, seed, last
1119
+
1120
+
1121
+ # ============================================================
1122
+ # UI
1123
+ # ============================================================
1124
+
1125
# Page-level CSS: center the main column and enlarge the page title.
css = """
#col-container {
    margin: 0 auto;
    max-width: 960px;
}
#main-title h1 {font-size: 2.1em !important;}
"""
1132
+
1133
# Markdown status line shown in the UI: which Rapid-AIO transformer build is
# active and where the choice came from (constants defined earlier in the file).
aio_status_line = (
    f"**AIO transformer version:** `{AIO_VERSION}` "
    f"({AIO_VERSION_SOURCE}; env `AIO_VERSION`={_AIO_ENV_RAW!r})"
)
1137
+
1138
+ with gr.Blocks() as demo:
1139
+ with gr.Column(elem_id="col-container"):
1140
+ gr.Markdown("# **Qwen-Image-Edit-2511-LoRAs-Fast**", elem_id="main-title")
1141
+ gr.Markdown(
1142
+ f"""This **experimental** space for [QIE-2511](https://huggingface.co/Qwen/Qwen-Image-Edit-2511) utilizes [extracted transformers](https://huggingface.co/Pr0f3ssi0n4ln00b/Phr00t-Qwen-Rapid-AIO) of [Phr00t’s Rapid AIO merge](https://huggingface.co/Phr00t/Qwen-Image-Edit-Rapid-AIO) and FA3-optimization with [LoRA](https://huggingface.co/models?other=base_model:adapter:Qwen/Qwen-Image-Edit-2511) support and a couple of extra features:
1143
+
1144
+ - Optional conditioning-only routing for extra reference latents
1145
+ - Uncapped canvas resolution
1146
+ - Optional VAE tiling for high resolutions
1147
+ - Optional depth mapping for conditioning
1148
+ - Optional routing of output to input for further iterations
1149
+ - Optional alternative decoder [VAE](https://huggingface.co/spacepxl/Wan2.1-VAE-upscale2x/tree/main/diffusers/Wan2.1_VAE_upscale2x_imageonly_real_v1)
1150
+
1151
+ Current environment is running **{AIO_VERSION}** of the Rapid AIO. Duplicate the space and set the **AIO_VERSION** space variable to use a different version."""
1152
+ )
1153
+ gr.Markdown(aio_status_line)
1154
+
1155
+ with gr.Row(equal_height=True):
1156
+ with gr.Column():
1157
+ input_image_1 = gr.Image(label="Upload Image 1 (Base / Target)", type="pil", )
1158
+
1159
+ input_image_2 = gr.Image(label="Upload Reference (Image 2)", type="pil", height=290, visible=False)
1160
+
1161
+ with gr.Column(visible=False) as camera_container:
1162
+ gr.Markdown("### 🎮 3D Camera Control\n*Drag handles: 🟢 Azimuth, 🩷 Elevation, 🟠 Distance*")
1163
+ camera_3d = CameraControl3D(value={"azimuth": 0, "elevation": 0, "distance": 1.0}, elem_id="camera-3d-control")
1164
+ gr.Markdown("### 🎚️ Slider Controls")
1165
+ azimuth_slider = gr.Slider(label="Azimuth", minimum=0, maximum=315, step=45, value=0, info="0°=front, 90°=right, 180°=back, 270°=left")
1166
+ elevation_slider = gr.Slider(label="Elevation", minimum=-30, maximum=60, step=30, value=0, info="-30°=low angle, 0°=eye, 60°=high angle")
1167
+ distance_slider = gr.Slider(label="Distance", minimum=0.6, maximum=1.4, step=0.4, value=1.0, info="0.6=close, 1.0=medium, 1.4=wide")
1168
+
1169
+
1170
+ input_images_extra = gr.Gallery(
1171
+ label="Upload Additional Images (auto-indexed after Image 1/2)",
1172
+ type="pil",
1173
+ height=290,
1174
+ columns=4,
1175
+ rows=2,
1176
+ interactive=True,
1177
+ )
1178
+
1179
+ prompt = gr.Text(
1180
+ label="Edit Prompt",
1181
+ show_label=True,
1182
+ placeholder="e.g., transform into photo..",
1183
+ )
1184
+
1185
+ run_button = gr.Button("Edit Image", variant="primary")
1186
+
1187
+ with gr.Column():
1188
+ output_image = gr.Image(label="Output Image", interactive=False, format="png", height=353)
1189
+
1190
+ last_output = gr.State(value=None)
1191
+
1192
+ with gr.Row():
1193
+ btn_out_to_img1 = gr.Button("⬅️ Output → Image 1", variant="secondary")
1194
+ btn_out_to_img2 = gr.Button("⬅️ Output → Image 2", variant="secondary")
1195
+ btn_out_to_extra = gr.Button("➕ Output → Extra Ref", variant="secondary")
1196
+
1197
+ derived_preview = gr.Image(
1198
+ label="Derived Conditioning Preview",
1199
+ interactive=False,
1200
+ format="png",
1201
+ height=200,
1202
+ visible=False,
1203
+ )
1204
+
1205
+ with gr.Row():
1206
+ lora_choices = [NONE_LORA] + list(ADAPTER_SPECS.keys())
1207
+ lora_adapter = gr.Dropdown(
1208
+ label="Choose Editing Style",
1209
+ choices=lora_choices,
1210
+ value=NONE_LORA,
1211
+ )
1212
+
1213
+ with gr.Accordion("Advanced Settings", open=False, visible=True):
1214
+ with gr.Accordion("Derived Conditioning (Pose / Depth)", open=False):
1215
+ derived_type = gr.Dropdown(
1216
+ label="Derived Type (from Image 1)",
1217
+ choices=["None", "Depth (Depth Anything V2 Small)"],
1218
+ value="None",
1219
+ )
1220
+ derived_use_gpu = gr.Checkbox(label="Use GPU for derived model", value=False)
1221
+ add_derived_btn = gr.Button("➕ Add derived ref to Extras (conditioning-only recommended)")
1222
+
1223
+ seed = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=0)
1224
+ randomize_seed = gr.Checkbox(label="Randomize Seed", value=True)
1225
+ guidance_scale = gr.Slider(label="Guidance Scale", minimum=1.0, maximum=10.0, step=0.1, value=1.0)
1226
+ steps = gr.Slider(label="Inference Steps", minimum=1, maximum=50, step=1, value=4)
1227
+ target_megapixels = gr.Slider(
1228
+ label="Target Megapixels (canvas, 0 = match input area)",
1229
+ minimum=0.0,
1230
+ maximum=6.0,
1231
+ step=0.1,
1232
+ value=1.0,
1233
+ )
1234
+ resolution_multiple = gr.Dropdown(
1235
+ label="Resolution lattice multiple (anti-drift)",
1236
+ choices=[32, 56, 112],
1237
+ value=32,
1238
+ interactive=True,
1239
+ )
1240
+ vae_ref_megapixels = gr.Slider(
1241
+ label="Extra refs VAE megapixels override (0 = use canvas)",
1242
+ minimum=0.0,
1243
+ maximum=6.0,
1244
+ step=0.1,
1245
+ value=0.0,
1246
+ )
1247
+ decoder_vae = gr.Dropdown(
1248
+ label="Decoder VAE",
1249
+ choices=["qwen", "wan2x"],
1250
+ value="qwen",
1251
+ interactive=True,
1252
+ )
1253
+ keep_decoder_2x = gr.Checkbox(
1254
+ label="Keep 2× output (wan2x only)",
1255
+ value=False,
1256
+ )
1257
+ extras_condition_only = gr.Checkbox(
1258
+ label="Extra references are conditioning-only (exclude from VAE)",
1259
+ value=True,
1260
+ )
1261
+ pad_to_canvas = gr.Checkbox(
1262
+ label="Pad images to canvas aspect (avoid warping)",
1263
+ value=True,
1264
+ )
1265
+
1266
+ # ✅ NEW: VAE tiling toggle (OFF by default)
1267
+ vae_tiling = gr.Checkbox(
1268
+ label="VAE tiling (lower VRAM, slower)",
1269
+ value=False,
1270
+ )
1271
+
1272
+ # On LoRA selection: preset prompt + toggle Image 2
1273
+ lora_adapter.change(
1274
+ fn=on_lora_change_ui,
1275
+ inputs=[lora_adapter, prompt, extras_condition_only],
1276
+ outputs=[prompt, input_image_2, extras_condition_only, camera_container],
1277
+ )
1278
+
1279
+ # Examples removed automatically by setup_manager
1280
+
1281
+
1282
+ # --- 3D Camera Events ---
1283
    def update_prompt_from_sliders(az, el, dist, curr_prompt):
        """Rebuild the prompt's camera trigger tag from the three slider values."""
        return update_prompt_with_camera(az, el, dist, curr_prompt)
1285
+
1286
+ def sync_3d_to_sliders(cv, curr_prompt):
1287
+ if cv and isinstance(cv, dict):
1288
+ az = cv.get('azimuth', 0)
1289
+ el = cv.get('elevation', 0)
1290
+ dist = cv.get('distance', 1.0)
1291
+ return az, el, dist, update_prompt_with_camera(az, el, dist, curr_prompt)
1292
+ return gr.update(), gr.update(), gr.update(), gr.update()
1293
+
1294
+ def sync_sliders_to_3d(az, el, dist):
1295
+ return {"azimuth": az, "elevation": el, "distance": dist}
1296
+
1297
+
1298
+ def update_3d_image(img):
1299
+ if img is None: return gr.update(imageUrl=None)
1300
+ import base64
1301
+ from io import BytesIO
1302
+ buf = BytesIO()
1303
+ img.save(buf, format="PNG")
1304
+ durl = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}"
1305
+ return gr.update(imageUrl=durl)
1306
+
1307
+ for slider in [azimuth_slider, elevation_slider, distance_slider]:
1308
+ slider.change(fn=update_prompt_from_sliders, inputs=[azimuth_slider, elevation_slider, distance_slider, prompt], outputs=[prompt])
1309
+ slider.release(fn=sync_sliders_to_3d, inputs=[azimuth_slider, elevation_slider, distance_slider], outputs=[camera_3d])
1310
+
1311
+ camera_3d.change(fn=sync_3d_to_sliders, inputs=[camera_3d, prompt], outputs=[azimuth_slider, elevation_slider, distance_slider, prompt])
1312
+
1313
+ input_image_1.upload(fn=update_3d_image, inputs=[input_image_1], outputs=[camera_3d])
1314
+ input_image_1.clear(fn=lambda: gr.update(imageUrl=None), outputs=[camera_3d])
1315
+
1316
+ run_button.click(
1317
+ fn=infer,
1318
+ inputs=[
1319
+ input_image_1,
1320
+ input_image_2,
1321
+ input_images_extra,
1322
+ prompt,
1323
+ lora_adapter,
1324
+ seed,
1325
+ randomize_seed,
1326
+ guidance_scale,
1327
+ steps,
1328
+ target_megapixels,
1329
+ extras_condition_only,
1330
+ pad_to_canvas,
1331
+ vae_tiling,
1332
+ resolution_multiple,
1333
+ vae_ref_megapixels,
1334
+ decoder_vae,
1335
+ keep_decoder_2x,
1336
+ ],
1337
+ outputs=[output_image, seed, last_output],
1338
+ )
1339
+
1340
+ # Output routing buttons
1341
+ btn_out_to_img1.click(fn=set_output_as_image1, inputs=[last_output], outputs=[input_image_1])
1342
+ btn_out_to_img2.click(fn=set_output_as_image2, inputs=[last_output], outputs=[input_image_2])
1343
+ btn_out_to_extra.click(fn=set_output_as_extra, inputs=[last_output, input_images_extra], outputs=[input_images_extra])
1344
+
1345
+ # Derived conditioning: append pose/depth map as extra ref (UI shows preview)
1346
+ add_derived_btn.click(
1347
+ fn=add_derived_ref,
1348
+ inputs=[input_image_1, input_images_extra, derived_type, derived_use_gpu],
1349
+ outputs=[input_images_extra, derived_preview],
1350
+ )
1351
+
1352
+ if __name__ == "__main__":
1353
+ head = '<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>'
1354
+ demo.queue(max_size=30).launch(head=head, server_name="0.0.0.0", share=True,
1355
+ css=css,
1356
+ theme=orange_red_theme,
1357
+ mcp_server=True,
1358
+ ssr_mode=False,
1359
+ show_error=True,
1360
+ )
1361
+
1362
# Manual Patch for missing prompts
# NOTE(review): this runs *after* the `if __name__ == "__main__"` launch above;
# when the file is executed as a script, `launch()` normally blocks, so these
# presets would only be registered after shutdown. If the intent is for them to
# be active at runtime, this block should run before `demo` is built/launched —
# TODO confirm how the hosting environment executes this file (run vs import).
try:
    LORA_PRESET_PROMPTS.update({
        "Consistance": "improve consistency and quality of the generated image",
        "F2P": "transform the image into a high-quality photo with realistic details",
        "Multiple-Angles": "change the camera angle of the image",
        "Light-Restoration": "Remove shadows and relight the image using soft lighting",
        "Relight": "Relight the image with cinematic lighting",
        "Multi-Angle-Lighting": "Change the lighting direction and intensity",
        "Edit-Skin": "Enhance skin textures and natural details",
        "Next-Scene": "Generate the next scene based on the current image",
        "Flat-Log": "Desaturate and lower contrast for a flat log look",
        "Upscale-Image": "Enhance and sharpen the image details",
        "BFS-Best-FaceSwap": "head_swap : start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure, mouth, lips and front head of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
        "BFS-Best-FaceSwap-merge": "head_swap : start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure, mouth, lips and front head of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
        "Qwen-lora-nsfw": "Convert this picture to artistic style.",
    })
except NameError:
    # LORA_PRESET_PROMPTS may not exist in trimmed deployments; best-effort patch.
    pass
camera_control_ui.py ADDED
@@ -0,0 +1,589 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
# Azimuth mappings (8 positions)
AZIMUTH_MAP = {
    0: "front view",
    45: "front-right quarter view",
    90: "right side view",
    135: "back-right quarter view",
    180: "back view",
    225: "back-left quarter view",
    270: "left side view",
    315: "front-left quarter view",
}

# Elevation mappings (4 positions)
ELEVATION_MAP = {
    -30: "low-angle shot",
    0: "eye-level shot",
    30: "elevated shot",
    60: "high-angle shot",
}

# Distance mappings (3 positions).
# Bug fix: the app's distance slider (min 0.6, max 1.4, step 0.4) and the JS
# 3D control both use 0.6 / 1.0 / 1.4, but "wide shot" was keyed at 1.8 here.
# With 1.8, snapping the slider's 1.4 ties between 1.0 and 1.8 and resolves to
# 1.0, so the wide-shot position produced "medium shot". Key it at 1.4.
DISTANCE_MAP = {
    0.6: "close-up",
    1.0: "medium shot",
    1.4: "wide shot",
}
29
+
30
+
31
def snap_to_nearest(value, options):
    """Return the element of *options* closest to *value*.

    Ties resolve to the earliest option; an empty *options* raises ValueError
    (``min`` semantics).
    """
    def distance(option):
        return abs(option - value)

    return min(options, key=distance)
34
+
35
+
36
def build_camera_prompt(azimuth: float, elevation: float, distance: float) -> str:
    """
    Build a camera prompt from azimuth, elevation, and distance values.

    Each value is snapped to the nearest key of its mapping table
    (AZIMUTH_MAP / ELEVATION_MAP / DISTANCE_MAP defined above), so arbitrary
    inputs always resolve to one of the supported camera descriptions.

    Args:
        azimuth: Horizontal rotation in degrees (snapped to AZIMUTH_MAP keys).
        elevation: Vertical angle in degrees (snapped to ELEVATION_MAP keys).
        distance: Distance factor (snapped to DISTANCE_MAP keys).

    Returns:
        Formatted "<sks> <azimuth> <elevation> <distance>" trigger string
        for the camera LoRA.
    """
    # Snap to nearest valid values
    azimuth_snapped = snap_to_nearest(azimuth, list(AZIMUTH_MAP.keys()))
    elevation_snapped = snap_to_nearest(elevation, list(ELEVATION_MAP.keys()))
    distance_snapped = snap_to_nearest(distance, list(DISTANCE_MAP.keys()))

    azimuth_name = AZIMUTH_MAP[azimuth_snapped]
    elevation_name = ELEVATION_MAP[elevation_snapped]
    distance_name = DISTANCE_MAP[distance_snapped]

    return f"<sks> {azimuth_name} {elevation_name} {distance_name}"
58
+
59
def update_prompt_with_camera(azimuth: float, elevation: float, distance: float, current_prompt: str) -> str:
    """
    Update the existing prompt by replacing or appending the camera trigger words.

    Any previous "<sks> ... shot" tag is stripped and the freshly built camera
    string is appended at the end.
    """
    import re
    camera_str = build_camera_prompt(azimuth, elevation, distance)

    if not current_prompt:
        return camera_str

    # Remove any existing <sks> ... shot tags.
    # NOTE(review): the negative lookahead makes this match stretch from
    # "<sks>" to the *last* occurrence of "shot" in the prompt — if the user's
    # own text contains the word "shot" after the tag, that text is removed
    # too. Confirm this is intended.
    clean_prompt = re.sub(r"<sks>.*?shot(?!.*shot)", "", current_prompt).strip()

    # Clean up multiple spaces
    clean_prompt = re.sub(r"\s+", " ", clean_prompt)

    if clean_prompt:
        return f"{clean_prompt} {camera_str}"
    return camera_str
79
+
80
+
81
+
82
+ # --- 3D Camera Control Component ---
83
+ class CameraControl3D(gr.HTML):
84
+ """
85
+ A 3D camera control component using Three.js.
86
+ Outputs: { azimuth: number, elevation: number, distance: number }
87
+ Accepts imageUrl prop to display user's uploaded image on the plane.
88
+ """
89
+ def __init__(self, value=None, imageUrl=None, **kwargs):
90
+ if value is None:
91
+ value = {"azimuth": 0, "elevation": 0, "distance": 1.0}
92
+
93
+ html_template = """
94
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
95
+ <div id="camera-control-wrapper" style="width: 100%; height: 450px; position: relative; background: #1a1a1a; border-radius: 12px; overflow: hidden;">
96
+ <div id="prompt-overlay" style="position: absolute; bottom: 10px; left: 50%; transform: translateX(-50%); background: rgba(0,0,0,0.8); padding: 8px 16px; border-radius: 8px; font-family: monospace; font-size: 12px; color: #00ff88; white-space: nowrap; z-index: 10;"></div>
97
+ </div>
98
+ """
99
+
100
+ js_on_load = """
101
+ (() => {
102
+ const wrapper = element.querySelector('#camera-control-wrapper');
103
+ const promptOverlay = element.querySelector('#prompt-overlay');
104
+
105
+ // Wait for THREE to load
106
+ const initScene = () => {
107
+ if (typeof THREE === 'undefined') {
108
+ setTimeout(initScene, 100);
109
+ return;
110
+ }
111
+
112
+ // Scene setup
113
+ const scene = new THREE.Scene();
114
+ scene.background = new THREE.Color(0x1a1a1a);
115
+
116
+ const camera = new THREE.PerspectiveCamera(50, wrapper.clientWidth / wrapper.clientHeight, 0.1, 1000);
117
+ camera.position.set(4.5, 3, 4.5);
118
+ camera.lookAt(0, 0.75, 0);
119
+
120
+ const renderer = new THREE.WebGLRenderer({ antialias: true });
121
+ renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
122
+ renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
123
+ wrapper.insertBefore(renderer.domElement, promptOverlay);
124
+
125
+ // Lighting
126
+ scene.add(new THREE.AmbientLight(0xffffff, 0.6));
127
+ const dirLight = new THREE.DirectionalLight(0xffffff, 0.6);
128
+ dirLight.position.set(5, 10, 5);
129
+ scene.add(dirLight);
130
+
131
+ // Grid
132
+ scene.add(new THREE.GridHelper(8, 16, 0x333333, 0x222222));
133
+
134
+ // Constants - reduced distances for tighter framing
135
+ const CENTER = new THREE.Vector3(0, 0.75, 0);
136
+ const BASE_DISTANCE = 1.6;
137
+ const AZIMUTH_RADIUS = 2.4;
138
+ const ELEVATION_RADIUS = 1.8;
139
+
140
+ // State
141
+ let azimuthAngle = props.value?.azimuth || 0;
142
+ let elevationAngle = props.value?.elevation || 0;
143
+ let distanceFactor = props.value?.distance || 1.0;
144
+
145
+ // Mappings - reduced wide shot multiplier
146
+ const azimuthSteps = [0, 45, 90, 135, 180, 225, 270, 315];
147
+ const elevationSteps = [-30, 0, 30, 60];
148
+ const distanceSteps = [0.6, 1.0, 1.4];
149
+
150
+ const azimuthNames = {
151
+ 0: 'front view', 45: 'front-right quarter view', 90: 'right side view',
152
+ 135: 'back-right quarter view', 180: 'back view', 225: 'back-left quarter view',
153
+ 270: 'left side view', 315: 'front-left quarter view'
154
+ };
155
+ const elevationNames = { '-30': 'low-angle shot', '0': 'eye-level shot', '30': 'elevated shot', '60': 'high-angle shot' };
156
+ const distanceNames = { '0.6': 'close-up', '1': 'medium shot', '1.4': 'wide shot' };
157
+
158
+ function snapToNearest(value, steps) {
159
+ return steps.reduce((prev, curr) => Math.abs(curr - value) < Math.abs(prev - value) ? curr : prev);
160
+ }
161
+
162
+ // Create placeholder texture (smiley face)
163
+ function createPlaceholderTexture() {
164
+ const canvas = document.createElement('canvas');
165
+ canvas.width = 256;
166
+ canvas.height = 256;
167
+ const ctx = canvas.getContext('2d');
168
+ ctx.fillStyle = '#3a3a4a';
169
+ ctx.fillRect(0, 0, 256, 256);
170
+ ctx.fillStyle = '#ffcc99';
171
+ ctx.beginPath();
172
+ ctx.arc(128, 128, 80, 0, Math.PI * 2);
173
+ ctx.fill();
174
+ ctx.fillStyle = '#333';
175
+ ctx.beginPath();
176
+ ctx.arc(100, 110, 10, 0, Math.PI * 2);
177
+ ctx.arc(156, 110, 10, 0, Math.PI * 2);
178
+ ctx.fill();
179
+ ctx.strokeStyle = '#333';
180
+ ctx.lineWidth = 3;
181
+ ctx.beginPath();
182
+ ctx.arc(128, 130, 35, 0.2, Math.PI - 0.2);
183
+ ctx.stroke();
184
+ return new THREE.CanvasTexture(canvas);
185
+ }
186
+
187
+ // Target image plane
188
+ let currentTexture = createPlaceholderTexture();
189
+ const planeMaterial = new THREE.MeshBasicMaterial({ map: currentTexture, side: THREE.DoubleSide });
190
+ let targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
191
+ targetPlane.position.copy(CENTER);
192
+ scene.add(targetPlane);
193
+
194
+ // Function to update texture from image URL
195
+ function updateTextureFromUrl(url) {
196
+ if (!url) {
197
+ // Reset to placeholder
198
+ planeMaterial.map = createPlaceholderTexture();
199
+ planeMaterial.needsUpdate = true;
200
+ // Reset plane to square
201
+ scene.remove(targetPlane);
202
+ targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
203
+ targetPlane.position.copy(CENTER);
204
+ scene.add(targetPlane);
205
+ return;
206
+ }
207
+
208
+ const loader = new THREE.TextureLoader();
209
+ loader.crossOrigin = 'anonymous';
210
+ loader.load(url, (texture) => {
211
+ texture.minFilter = THREE.LinearFilter;
212
+ texture.magFilter = THREE.LinearFilter;
213
+ planeMaterial.map = texture;
214
+ planeMaterial.needsUpdate = true;
215
+
216
+ // Adjust plane aspect ratio to match image
217
+ const img = texture.image;
218
+ if (img && img.width && img.height) {
219
+ const aspect = img.width / img.height;
220
+ const maxSize = 1.5;
221
+ let planeWidth, planeHeight;
222
+ if (aspect > 1) {
223
+ planeWidth = maxSize;
224
+ planeHeight = maxSize / aspect;
225
+ } else {
226
+ planeHeight = maxSize;
227
+ planeWidth = maxSize * aspect;
228
+ }
229
+ scene.remove(targetPlane);
230
+ targetPlane = new THREE.Mesh(
231
+ new THREE.PlaneGeometry(planeWidth, planeHeight),
232
+ planeMaterial
233
+ );
234
+ targetPlane.position.copy(CENTER);
235
+ scene.add(targetPlane);
236
+ }
237
+ }, undefined, (err) => {
238
+ console.error('Failed to load texture:', err);
239
+ });
240
+ }
241
+
242
+ // Check for initial imageUrl
243
+ if (props.imageUrl) {
244
+ updateTextureFromUrl(props.imageUrl);
245
+ }
246
+
247
+ // Camera model
248
+ const cameraGroup = new THREE.Group();
249
+ const bodyMat = new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 });
250
+ const body = new THREE.Mesh(new THREE.BoxGeometry(0.3, 0.22, 0.38), bodyMat);
251
+ cameraGroup.add(body);
252
+ const lens = new THREE.Mesh(
253
+ new THREE.CylinderGeometry(0.09, 0.11, 0.18, 16),
254
+ new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 })
255
+ );
256
+ lens.rotation.x = Math.PI / 2;
257
+ lens.position.z = 0.26;
258
+ cameraGroup.add(lens);
259
+ scene.add(cameraGroup);
260
+
261
+ // GREEN: Azimuth ring
262
+ const azimuthRing = new THREE.Mesh(
263
+ new THREE.TorusGeometry(AZIMUTH_RADIUS, 0.04, 16, 64),
264
+ new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.3 })
265
+ );
266
+ azimuthRing.rotation.x = Math.PI / 2;
267
+ azimuthRing.position.y = 0.05;
268
+ scene.add(azimuthRing);
269
+
270
+ const azimuthHandle = new THREE.Mesh(
271
+ new THREE.SphereGeometry(0.18, 16, 16),
272
+ new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.5 })
273
+ );
274
+ azimuthHandle.userData.type = 'azimuth';
275
+ scene.add(azimuthHandle);
276
+
277
+ // PINK: Elevation arc
278
+ const arcPoints = [];
279
+ for (let i = 0; i <= 32; i++) {
280
+ const angle = THREE.MathUtils.degToRad(-30 + (90 * i / 32));
281
+ arcPoints.push(new THREE.Vector3(-0.8, ELEVATION_RADIUS * Math.sin(angle) + CENTER.y, ELEVATION_RADIUS * Math.cos(angle)));
282
+ }
283
+ const arcCurve = new THREE.CatmullRomCurve3(arcPoints);
284
+ const elevationArc = new THREE.Mesh(
285
+ new THREE.TubeGeometry(arcCurve, 32, 0.04, 8, false),
286
+ new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.3 })
287
+ );
288
+ scene.add(elevationArc);
289
+
290
+ const elevationHandle = new THREE.Mesh(
291
+ new THREE.SphereGeometry(0.18, 16, 16),
292
+ new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.5 })
293
+ );
294
+ elevationHandle.userData.type = 'elevation';
295
+ scene.add(elevationHandle);
296
+
297
+ // ORANGE: Distance line & handle
298
+ const distanceLineGeo = new THREE.BufferGeometry();
299
+ const distanceLine = new THREE.Line(distanceLineGeo, new THREE.LineBasicMaterial({ color: 0xffa500 }));
300
+ scene.add(distanceLine);
301
+
302
+ const distanceHandle = new THREE.Mesh(
303
+ new THREE.SphereGeometry(0.18, 16, 16),
304
+ new THREE.MeshStandardMaterial({ color: 0xffa500, emissive: 0xffa500, emissiveIntensity: 0.5 })
305
+ );
306
+ distanceHandle.userData.type = 'distance';
307
+ scene.add(distanceHandle);
308
+
309
+ function updatePositions() {
310
+ const distance = BASE_DISTANCE * distanceFactor;
311
+ const azRad = THREE.MathUtils.degToRad(azimuthAngle);
312
+ const elRad = THREE.MathUtils.degToRad(elevationAngle);
313
+
314
+ const camX = distance * Math.sin(azRad) * Math.cos(elRad);
315
+ const camY = distance * Math.sin(elRad) + CENTER.y;
316
+ const camZ = distance * Math.cos(azRad) * Math.cos(elRad);
317
+
318
+ cameraGroup.position.set(camX, camY, camZ);
319
+ cameraGroup.lookAt(CENTER);
320
+
321
+ azimuthHandle.position.set(AZIMUTH_RADIUS * Math.sin(azRad), 0.05, AZIMUTH_RADIUS * Math.cos(azRad));
322
+ elevationHandle.position.set(-0.8, ELEVATION_RADIUS * Math.sin(elRad) + CENTER.y, ELEVATION_RADIUS * Math.cos(elRad));
323
+
324
+ const orangeDist = distance - 0.5;
325
+ distanceHandle.position.set(
326
+ orangeDist * Math.sin(azRad) * Math.cos(elRad),
327
+ orangeDist * Math.sin(elRad) + CENTER.y,
328
+ orangeDist * Math.cos(azRad) * Math.cos(elRad)
329
+ );
330
+ distanceLineGeo.setFromPoints([cameraGroup.position.clone(), CENTER.clone()]);
331
+
332
+ // Update prompt
333
+ const azSnap = snapToNearest(azimuthAngle, azimuthSteps);
334
+ const elSnap = snapToNearest(elevationAngle, elevationSteps);
335
+ const distSnap = snapToNearest(distanceFactor, distanceSteps);
336
+ const distKey = distSnap === 1 ? '1' : distSnap.toFixed(1);
337
+ const prompt = '<sks> ' + azimuthNames[azSnap] + ' ' + elevationNames[String(elSnap)] + ' ' + distanceNames[distKey];
338
+ promptOverlay.textContent = prompt;
339
+ }
340
+
341
+ function updatePropsAndTrigger() {
342
+ const azSnap = snapToNearest(azimuthAngle, azimuthSteps);
343
+ const elSnap = snapToNearest(elevationAngle, elevationSteps);
344
+ const distSnap = snapToNearest(distanceFactor, distanceSteps);
345
+
346
+ props.value = { azimuth: azSnap, elevation: elSnap, distance: distSnap };
347
+ trigger('change', props.value);
348
+ }
349
+
350
+ // Raycasting
351
+ const raycaster = new THREE.Raycaster();
352
+ const mouse = new THREE.Vector2();
353
+ let isDragging = false;
354
+ let dragTarget = null;
355
+ let dragStartMouse = new THREE.Vector2();
356
+ let dragStartDistance = 1.0;
357
+ const intersection = new THREE.Vector3();
358
+
359
+ const canvas = renderer.domElement;
360
+
361
+ canvas.addEventListener('mousedown', (e) => {
362
+ const rect = canvas.getBoundingClientRect();
363
+ mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
364
+ mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
365
+
366
+ raycaster.setFromCamera(mouse, camera);
367
+ const intersects = raycaster.intersectObjects([azimuthHandle, elevationHandle, distanceHandle]);
368
+
369
+ if (intersects.length > 0) {
370
+ isDragging = true;
371
+ dragTarget = intersects[0].object;
372
+ dragTarget.material.emissiveIntensity = 1.0;
373
+ dragTarget.scale.setScalar(1.3);
374
+ dragStartMouse.copy(mouse);
375
+ dragStartDistance = distanceFactor;
376
+ canvas.style.cursor = 'grabbing';
377
+ }
378
+ });
379
+
380
+ canvas.addEventListener('mousemove', (e) => {
381
+ const rect = canvas.getBoundingClientRect();
382
+ mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
383
+ mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
384
+
385
+ if (isDragging && dragTarget) {
386
+ raycaster.setFromCamera(mouse, camera);
387
+
388
+ if (dragTarget.userData.type === 'azimuth') {
389
+ const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
390
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
391
+ azimuthAngle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
392
+ if (azimuthAngle < 0) azimuthAngle += 360;
393
+ }
394
+ } else if (dragTarget.userData.type === 'elevation') {
395
+ const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), -0.8);
396
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
397
+ const relY = intersection.y - CENTER.y;
398
+ const relZ = intersection.z;
399
+ elevationAngle = THREE.MathUtils.clamp(THREE.MathUtils.radToDeg(Math.atan2(relY, relZ)), -30, 60);
400
+ }
401
+ } else if (dragTarget.userData.type === 'distance') {
402
+ const deltaY = mouse.y - dragStartMouse.y;
403
+ distanceFactor = THREE.MathUtils.clamp(dragStartDistance - deltaY * 1.5, 0.6, 1.4);
404
+ }
405
+ updatePositions();
406
+ } else {
407
+ raycaster.setFromCamera(mouse, camera);
408
+ const intersects = raycaster.intersectObjects([azimuthHandle, elevationHandle, distanceHandle]);
409
+ [azimuthHandle, elevationHandle, distanceHandle].forEach(h => {
410
+ h.material.emissiveIntensity = 0.5;
411
+ h.scale.setScalar(1);
412
+ });
413
+ if (intersects.length > 0) {
414
+ intersects[0].object.material.emissiveIntensity = 0.8;
415
+ intersects[0].object.scale.setScalar(1.1);
416
+ canvas.style.cursor = 'grab';
417
+ } else {
418
+ canvas.style.cursor = 'default';
419
+ }
420
+ }
421
+ });
422
+
423
+ const onMouseUp = () => {
424
+ if (dragTarget) {
425
+ dragTarget.material.emissiveIntensity = 0.5;
426
+ dragTarget.scale.setScalar(1);
427
+
428
+ // Snap and animate
429
+ const targetAz = snapToNearest(azimuthAngle, azimuthSteps);
430
+ const targetEl = snapToNearest(elevationAngle, elevationSteps);
431
+ const targetDist = snapToNearest(distanceFactor, distanceSteps);
432
+
433
+ const startAz = azimuthAngle, startEl = elevationAngle, startDist = distanceFactor;
434
+ const startTime = Date.now();
435
+
436
+ function animateSnap() {
437
+ const t = Math.min((Date.now() - startTime) / 200, 1);
438
+ const ease = 1 - Math.pow(1 - t, 3);
439
+
440
+ let azDiff = targetAz - startAz;
441
+ if (azDiff > 180) azDiff -= 360;
442
+ if (azDiff < -180) azDiff += 360;
443
+ azimuthAngle = startAz + azDiff * ease;
444
+ if (azimuthAngle < 0) azimuthAngle += 360;
445
+ if (azimuthAngle >= 360) azimuthAngle -= 360;
446
+
447
+ elevationAngle = startEl + (targetEl - startEl) * ease;
448
+ distanceFactor = startDist + (targetDist - startDist) * ease;
449
+
450
+ updatePositions();
451
+ if (t < 1) requestAnimationFrame(animateSnap);
452
+ else updatePropsAndTrigger();
453
+ }
454
+ animateSnap();
455
+ }
456
+ isDragging = false;
457
+ dragTarget = null;
458
+ canvas.style.cursor = 'default';
459
+ };
460
+
461
+ canvas.addEventListener('mouseup', onMouseUp);
462
+ canvas.addEventListener('mouseleave', onMouseUp);
463
+
464
+ // Touch support for mobile
465
+ canvas.addEventListener('touchstart', (e) => {
466
+ e.preventDefault();
467
+ const touch = e.touches[0];
468
+ const rect = canvas.getBoundingClientRect();
469
+ mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
470
+ mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
471
+
472
+ raycaster.setFromCamera(mouse, camera);
473
+ const intersects = raycaster.intersectObjects([azimuthHandle, elevationHandle, distanceHandle]);
474
+
475
+ if (intersects.length > 0) {
476
+ isDragging = true;
477
+ dragTarget = intersects[0].object;
478
+ dragTarget.material.emissiveIntensity = 1.0;
479
+ dragTarget.scale.setScalar(1.3);
480
+ dragStartMouse.copy(mouse);
481
+ dragStartDistance = distanceFactor;
482
+ }
483
+ }, { passive: false });
484
+
485
+ canvas.addEventListener('touchmove', (e) => {
486
+ e.preventDefault();
487
+ const touch = e.touches[0];
488
+ const rect = canvas.getBoundingClientRect();
489
+ mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
490
+ mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
491
+
492
+ if (isDragging && dragTarget) {
493
+ raycaster.setFromCamera(mouse, camera);
494
+
495
+ if (dragTarget.userData.type === 'azimuth') {
496
+ const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
497
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
498
+ azimuthAngle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
499
+ if (azimuthAngle < 0) azimuthAngle += 360;
500
+ }
501
+ } else if (dragTarget.userData.type === 'elevation') {
502
+ const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), -0.8);
503
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
504
+ const relY = intersection.y - CENTER.y;
505
+ const relZ = intersection.z;
506
+ elevationAngle = THREE.MathUtils.clamp(THREE.MathUtils.radToDeg(Math.atan2(relY, relZ)), -30, 60);
507
+ }
508
+ } else if (dragTarget.userData.type === 'distance') {
509
+ const deltaY = mouse.y - dragStartMouse.y;
510
+ distanceFactor = THREE.MathUtils.clamp(dragStartDistance - deltaY * 1.5, 0.6, 1.4);
511
+ }
512
+ updatePositions();
513
+ }
514
+ }, { passive: false });
515
+
516
+ canvas.addEventListener('touchend', (e) => {
517
+ e.preventDefault();
518
+ onMouseUp();
519
+ }, { passive: false });
520
+
521
+ canvas.addEventListener('touchcancel', (e) => {
522
+ e.preventDefault();
523
+ onMouseUp();
524
+ }, { passive: false });
525
+
526
+ // Initial update
527
+ updatePositions();
528
+
529
+ // Render loop
530
+ function render() {
531
+ requestAnimationFrame(render);
532
+ renderer.render(scene, camera);
533
+ }
534
+ render();
535
+
536
+ // Handle resize
537
+ new ResizeObserver(() => {
538
+ camera.aspect = wrapper.clientWidth / wrapper.clientHeight;
539
+ camera.updateProjectionMatrix();
540
+ renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
541
+ }).observe(wrapper);
542
+
543
+ // Store update functions for external calls
544
+ wrapper._updateFromProps = (newVal) => {
545
+ if (newVal && typeof newVal === 'object') {
546
+ azimuthAngle = newVal.azimuth ?? azimuthAngle;
547
+ elevationAngle = newVal.elevation ?? elevationAngle;
548
+ distanceFactor = newVal.distance ?? distanceFactor;
549
+ updatePositions();
550
+ }
551
+ };
552
+
553
+ wrapper._updateTexture = updateTextureFromUrl;
554
+
555
+ // Watch for prop changes (imageUrl and value)
556
+ let lastImageUrl = props.imageUrl;
557
+ let lastValue = JSON.stringify(props.value);
558
+ setInterval(() => {
559
+ // Check imageUrl changes
560
+ if (props.imageUrl !== lastImageUrl) {
561
+ lastImageUrl = props.imageUrl;
562
+ updateTextureFromUrl(props.imageUrl);
563
+ }
564
+ // Check value changes (from sliders)
565
+ const currentValue = JSON.stringify(props.value);
566
+ if (currentValue !== lastValue) {
567
+ lastValue = currentValue;
568
+ if (props.value && typeof props.value === 'object') {
569
+ azimuthAngle = props.value.azimuth ?? azimuthAngle;
570
+ elevationAngle = props.value.elevation ?? elevationAngle;
571
+ distanceFactor = props.value.distance ?? distanceFactor;
572
+ updatePositions();
573
+ }
574
+ }
575
+ }, 100);
576
+ };
577
+
578
+ initScene();
579
+ })();
580
+ """
581
+
582
+ super().__init__(
583
+ value=value,
584
+ html_template=html_template,
585
+ js_on_load=js_on_load,
586
+ imageUrl=imageUrl,
587
+ **kwargs
588
+ )
589
+
camera_control_ui.pyi ADDED
@@ -0,0 +1,572 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+
3
# Azimuth mappings (8 positions): horizontal camera rotation in degrees
# -> LoRA vocabulary term. Mirrors `azimuthNames` in the embedded JS
# component. Key order matters: build_camera_prompt snaps via
# snap_to_nearest(list(AZIMUTH_MAP.keys())), and min() resolves
# equidistant ties to the earlier key.
AZIMUTH_MAP = {
    0: "front view",
    45: "front-right quarter view",
    90: "right side view",
    135: "back-right quarter view",
    180: "back view",
    225: "back-left quarter view",
    270: "left side view",
    315: "front-left quarter view"
}
14
+
15
# Elevation mappings (4 positions): vertical camera angle in degrees
# -> LoRA vocabulary term. Mirrors `elevationNames` in the embedded JS
# component (range -30..60, matching the JS elevation clamp).
ELEVATION_MAP = {
    -30: "low-angle shot",
    0: "eye-level shot",
    30: "elevated shot",
    60: "high-angle shot"
}
22
+
23
# Distance mappings (3 positions): distance factor -> LoRA vocabulary term.
# NOTE: keys must match the JS component's `distanceSteps = [0.6, 1.0, 1.4]`
# and `distanceNames` ('1.4' -> 'wide shot'). The previous key 1.8 was out
# of sync with the JS (which was reduced to 1.4): a UI value of 1.4 is
# equidistant from 1.0 and 1.8, so snap_to_nearest (min, first-wins)
# resolved it to 1.0 and the prompt said "medium shot" while the 3D
# overlay said "wide shot".
DISTANCE_MAP = {
    0.6: "close-up",
    1.0: "medium shot",
    1.4: "wide shot"
}
29
+
30
+
31
def snap_to_nearest(value, options):
    """Pick the entry of *options* with the smallest absolute distance to *value*.

    Equidistant candidates resolve to the one appearing first in *options*
    (min() keeps the earliest of equal keys). Raises ValueError when
    *options* is empty, as min() does.
    """
    def gap(option):
        return abs(option - value)

    return min(options, key=gap)
34
+
35
+
36
def build_camera_prompt(azimuth: float, elevation: float, distance: float) -> str:
    """Compose the LoRA camera prompt for a given camera pose.

    Each input is snapped to the nearest key of its mapping table
    (AZIMUTH_MAP / ELEVATION_MAP / DISTANCE_MAP) and the corresponding
    vocabulary terms are joined after the trigger token.

    Args:
        azimuth: Horizontal rotation in degrees (0-360).
        elevation: Vertical angle in degrees (-30 to 60).
        distance: Distance factor; snapped to a DISTANCE_MAP key.

    Returns:
        Prompt string of the form "<sks> <azimuth> <elevation> <distance>".
    """
    terms = []
    for raw_value, table in (
        (azimuth, AZIMUTH_MAP),
        (elevation, ELEVATION_MAP),
        (distance, DISTANCE_MAP),
    ):
        snapped = snap_to_nearest(raw_value, list(table.keys()))
        terms.append(table[snapped])

    return "<sks> " + " ".join(terms)
58
+
59
+ from gradio.events import Dependency
60
+
61
+ # --- 3D Camera Control Component ---
62
+ class CameraControl3D(gr.HTML):
63
+ """
64
+ A 3D camera control component using Three.js.
65
+ Outputs: { azimuth: number, elevation: number, distance: number }
66
+ Accepts imageUrl prop to display user's uploaded image on the plane.
67
+ """
68
+ def __init__(self, value=None, imageUrl=None, **kwargs):
69
+ if value is None:
70
+ value = {"azimuth": 0, "elevation": 0, "distance": 1.0}
71
+
72
+ html_template = """
73
+ <script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>
74
+ <div id="camera-control-wrapper" style="width: 100%; height: 450px; position: relative; background: #1a1a1a; border-radius: 12px; overflow: hidden;">
75
+ <div id="prompt-overlay" style="position: absolute; bottom: 10px; left: 50%; transform: translateX(-50%); background: rgba(0,0,0,0.8); padding: 8px 16px; border-radius: 8px; font-family: monospace; font-size: 12px; color: #00ff88; white-space: nowrap; z-index: 10;"></div>
76
+ </div>
77
+ """
78
+
79
+ js_on_load = """
80
+ (() => {
81
+ const wrapper = element.querySelector('#camera-control-wrapper');
82
+ const promptOverlay = element.querySelector('#prompt-overlay');
83
+
84
+ // Wait for THREE to load
85
+ const initScene = () => {
86
+ if (typeof THREE === 'undefined') {
87
+ setTimeout(initScene, 100);
88
+ return;
89
+ }
90
+
91
+ // Scene setup
92
+ const scene = new THREE.Scene();
93
+ scene.background = new THREE.Color(0x1a1a1a);
94
+
95
+ const camera = new THREE.PerspectiveCamera(50, wrapper.clientWidth / wrapper.clientHeight, 0.1, 1000);
96
+ camera.position.set(4.5, 3, 4.5);
97
+ camera.lookAt(0, 0.75, 0);
98
+
99
+ const renderer = new THREE.WebGLRenderer({ antialias: true });
100
+ renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
101
+ renderer.setPixelRatio(Math.min(window.devicePixelRatio, 2));
102
+ wrapper.insertBefore(renderer.domElement, promptOverlay);
103
+
104
+ // Lighting
105
+ scene.add(new THREE.AmbientLight(0xffffff, 0.6));
106
+ const dirLight = new THREE.DirectionalLight(0xffffff, 0.6);
107
+ dirLight.position.set(5, 10, 5);
108
+ scene.add(dirLight);
109
+
110
+ // Grid
111
+ scene.add(new THREE.GridHelper(8, 16, 0x333333, 0x222222));
112
+
113
+ // Constants - reduced distances for tighter framing
114
+ const CENTER = new THREE.Vector3(0, 0.75, 0);
115
+ const BASE_DISTANCE = 1.6;
116
+ const AZIMUTH_RADIUS = 2.4;
117
+ const ELEVATION_RADIUS = 1.8;
118
+
119
+ // State
120
+ let azimuthAngle = props.value?.azimuth || 0;
121
+ let elevationAngle = props.value?.elevation || 0;
122
+ let distanceFactor = props.value?.distance || 1.0;
123
+
124
+ // Mappings - reduced wide shot multiplier
125
+ const azimuthSteps = [0, 45, 90, 135, 180, 225, 270, 315];
126
+ const elevationSteps = [-30, 0, 30, 60];
127
+ const distanceSteps = [0.6, 1.0, 1.4];
128
+
129
+ const azimuthNames = {
130
+ 0: 'front view', 45: 'front-right quarter view', 90: 'right side view',
131
+ 135: 'back-right quarter view', 180: 'back view', 225: 'back-left quarter view',
132
+ 270: 'left side view', 315: 'front-left quarter view'
133
+ };
134
+ const elevationNames = { '-30': 'low-angle shot', '0': 'eye-level shot', '30': 'elevated shot', '60': 'high-angle shot' };
135
+ const distanceNames = { '0.6': 'close-up', '1': 'medium shot', '1.4': 'wide shot' };
136
+
137
+ function snapToNearest(value, steps) {
138
+ return steps.reduce((prev, curr) => Math.abs(curr - value) < Math.abs(prev - value) ? curr : prev);
139
+ }
140
+
141
+ // Create placeholder texture (smiley face)
142
+ function createPlaceholderTexture() {
143
+ const canvas = document.createElement('canvas');
144
+ canvas.width = 256;
145
+ canvas.height = 256;
146
+ const ctx = canvas.getContext('2d');
147
+ ctx.fillStyle = '#3a3a4a';
148
+ ctx.fillRect(0, 0, 256, 256);
149
+ ctx.fillStyle = '#ffcc99';
150
+ ctx.beginPath();
151
+ ctx.arc(128, 128, 80, 0, Math.PI * 2);
152
+ ctx.fill();
153
+ ctx.fillStyle = '#333';
154
+ ctx.beginPath();
155
+ ctx.arc(100, 110, 10, 0, Math.PI * 2);
156
+ ctx.arc(156, 110, 10, 0, Math.PI * 2);
157
+ ctx.fill();
158
+ ctx.strokeStyle = '#333';
159
+ ctx.lineWidth = 3;
160
+ ctx.beginPath();
161
+ ctx.arc(128, 130, 35, 0.2, Math.PI - 0.2);
162
+ ctx.stroke();
163
+ return new THREE.CanvasTexture(canvas);
164
+ }
165
+
166
+ // Target image plane
167
+ let currentTexture = createPlaceholderTexture();
168
+ const planeMaterial = new THREE.MeshBasicMaterial({ map: currentTexture, side: THREE.DoubleSide });
169
+ let targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
170
+ targetPlane.position.copy(CENTER);
171
+ scene.add(targetPlane);
172
+
173
+ // Function to update texture from image URL
174
+ function updateTextureFromUrl(url) {
175
+ if (!url) {
176
+ // Reset to placeholder
177
+ planeMaterial.map = createPlaceholderTexture();
178
+ planeMaterial.needsUpdate = true;
179
+ // Reset plane to square
180
+ scene.remove(targetPlane);
181
+ targetPlane = new THREE.Mesh(new THREE.PlaneGeometry(1.2, 1.2), planeMaterial);
182
+ targetPlane.position.copy(CENTER);
183
+ scene.add(targetPlane);
184
+ return;
185
+ }
186
+
187
+ const loader = new THREE.TextureLoader();
188
+ loader.crossOrigin = 'anonymous';
189
+ loader.load(url, (texture) => {
190
+ texture.minFilter = THREE.LinearFilter;
191
+ texture.magFilter = THREE.LinearFilter;
192
+ planeMaterial.map = texture;
193
+ planeMaterial.needsUpdate = true;
194
+
195
+ // Adjust plane aspect ratio to match image
196
+ const img = texture.image;
197
+ if (img && img.width && img.height) {
198
+ const aspect = img.width / img.height;
199
+ const maxSize = 1.5;
200
+ let planeWidth, planeHeight;
201
+ if (aspect > 1) {
202
+ planeWidth = maxSize;
203
+ planeHeight = maxSize / aspect;
204
+ } else {
205
+ planeHeight = maxSize;
206
+ planeWidth = maxSize * aspect;
207
+ }
208
+ scene.remove(targetPlane);
209
+ targetPlane = new THREE.Mesh(
210
+ new THREE.PlaneGeometry(planeWidth, planeHeight),
211
+ planeMaterial
212
+ );
213
+ targetPlane.position.copy(CENTER);
214
+ scene.add(targetPlane);
215
+ }
216
+ }, undefined, (err) => {
217
+ console.error('Failed to load texture:', err);
218
+ });
219
+ }
220
+
221
+ // Check for initial imageUrl
222
+ if (props.imageUrl) {
223
+ updateTextureFromUrl(props.imageUrl);
224
+ }
225
+
226
+ // Camera model
227
+ const cameraGroup = new THREE.Group();
228
+ const bodyMat = new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 });
229
+ const body = new THREE.Mesh(new THREE.BoxGeometry(0.3, 0.22, 0.38), bodyMat);
230
+ cameraGroup.add(body);
231
+ const lens = new THREE.Mesh(
232
+ new THREE.CylinderGeometry(0.09, 0.11, 0.18, 16),
233
+ new THREE.MeshStandardMaterial({ color: 0x6699cc, metalness: 0.5, roughness: 0.3 })
234
+ );
235
+ lens.rotation.x = Math.PI / 2;
236
+ lens.position.z = 0.26;
237
+ cameraGroup.add(lens);
238
+ scene.add(cameraGroup);
239
+
240
+ // GREEN: Azimuth ring
241
+ const azimuthRing = new THREE.Mesh(
242
+ new THREE.TorusGeometry(AZIMUTH_RADIUS, 0.04, 16, 64),
243
+ new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.3 })
244
+ );
245
+ azimuthRing.rotation.x = Math.PI / 2;
246
+ azimuthRing.position.y = 0.05;
247
+ scene.add(azimuthRing);
248
+
249
+ const azimuthHandle = new THREE.Mesh(
250
+ new THREE.SphereGeometry(0.18, 16, 16),
251
+ new THREE.MeshStandardMaterial({ color: 0x00ff88, emissive: 0x00ff88, emissiveIntensity: 0.5 })
252
+ );
253
+ azimuthHandle.userData.type = 'azimuth';
254
+ scene.add(azimuthHandle);
255
+
256
+ // PINK: Elevation arc
257
+ const arcPoints = [];
258
+ for (let i = 0; i <= 32; i++) {
259
+ const angle = THREE.MathUtils.degToRad(-30 + (90 * i / 32));
260
+ arcPoints.push(new THREE.Vector3(-0.8, ELEVATION_RADIUS * Math.sin(angle) + CENTER.y, ELEVATION_RADIUS * Math.cos(angle)));
261
+ }
262
+ const arcCurve = new THREE.CatmullRomCurve3(arcPoints);
263
+ const elevationArc = new THREE.Mesh(
264
+ new THREE.TubeGeometry(arcCurve, 32, 0.04, 8, false),
265
+ new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.3 })
266
+ );
267
+ scene.add(elevationArc);
268
+
269
+ const elevationHandle = new THREE.Mesh(
270
+ new THREE.SphereGeometry(0.18, 16, 16),
271
+ new THREE.MeshStandardMaterial({ color: 0xff69b4, emissive: 0xff69b4, emissiveIntensity: 0.5 })
272
+ );
273
+ elevationHandle.userData.type = 'elevation';
274
+ scene.add(elevationHandle);
275
+
276
+ // ORANGE: Distance line & handle
277
+ const distanceLineGeo = new THREE.BufferGeometry();
278
+ const distanceLine = new THREE.Line(distanceLineGeo, new THREE.LineBasicMaterial({ color: 0xffa500 }));
279
+ scene.add(distanceLine);
280
+
281
+ const distanceHandle = new THREE.Mesh(
282
+ new THREE.SphereGeometry(0.18, 16, 16),
283
+ new THREE.MeshStandardMaterial({ color: 0xffa500, emissive: 0xffa500, emissiveIntensity: 0.5 })
284
+ );
285
+ distanceHandle.userData.type = 'distance';
286
+ scene.add(distanceHandle);
287
+
288
+ function updatePositions() {
289
+ const distance = BASE_DISTANCE * distanceFactor;
290
+ const azRad = THREE.MathUtils.degToRad(azimuthAngle);
291
+ const elRad = THREE.MathUtils.degToRad(elevationAngle);
292
+
293
+ const camX = distance * Math.sin(azRad) * Math.cos(elRad);
294
+ const camY = distance * Math.sin(elRad) + CENTER.y;
295
+ const camZ = distance * Math.cos(azRad) * Math.cos(elRad);
296
+
297
+ cameraGroup.position.set(camX, camY, camZ);
298
+ cameraGroup.lookAt(CENTER);
299
+
300
+ azimuthHandle.position.set(AZIMUTH_RADIUS * Math.sin(azRad), 0.05, AZIMUTH_RADIUS * Math.cos(azRad));
301
+ elevationHandle.position.set(-0.8, ELEVATION_RADIUS * Math.sin(elRad) + CENTER.y, ELEVATION_RADIUS * Math.cos(elRad));
302
+
303
+ const orangeDist = distance - 0.5;
304
+ distanceHandle.position.set(
305
+ orangeDist * Math.sin(azRad) * Math.cos(elRad),
306
+ orangeDist * Math.sin(elRad) + CENTER.y,
307
+ orangeDist * Math.cos(azRad) * Math.cos(elRad)
308
+ );
309
+ distanceLineGeo.setFromPoints([cameraGroup.position.clone(), CENTER.clone()]);
310
+
311
+ // Update prompt
312
+ const azSnap = snapToNearest(azimuthAngle, azimuthSteps);
313
+ const elSnap = snapToNearest(elevationAngle, elevationSteps);
314
+ const distSnap = snapToNearest(distanceFactor, distanceSteps);
315
+ const distKey = distSnap === 1 ? '1' : distSnap.toFixed(1);
316
+ const prompt = '<sks> ' + azimuthNames[azSnap] + ' ' + elevationNames[String(elSnap)] + ' ' + distanceNames[distKey];
317
+ promptOverlay.textContent = prompt;
318
+ }
319
+
320
+ function updatePropsAndTrigger() {
321
+ const azSnap = snapToNearest(azimuthAngle, azimuthSteps);
322
+ const elSnap = snapToNearest(elevationAngle, elevationSteps);
323
+ const distSnap = snapToNearest(distanceFactor, distanceSteps);
324
+
325
+ props.value = { azimuth: azSnap, elevation: elSnap, distance: distSnap };
326
+ trigger('change', props.value);
327
+ }
328
+
329
+ // Raycasting
330
+ const raycaster = new THREE.Raycaster();
331
+ const mouse = new THREE.Vector2();
332
+ let isDragging = false;
333
+ let dragTarget = null;
334
+ let dragStartMouse = new THREE.Vector2();
335
+ let dragStartDistance = 1.0;
336
+ const intersection = new THREE.Vector3();
337
+
338
+ const canvas = renderer.domElement;
339
+
340
+ canvas.addEventListener('mousedown', (e) => {
341
+ const rect = canvas.getBoundingClientRect();
342
+ mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
343
+ mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
344
+
345
+ raycaster.setFromCamera(mouse, camera);
346
+ const intersects = raycaster.intersectObjects([azimuthHandle, elevationHandle, distanceHandle]);
347
+
348
+ if (intersects.length > 0) {
349
+ isDragging = true;
350
+ dragTarget = intersects[0].object;
351
+ dragTarget.material.emissiveIntensity = 1.0;
352
+ dragTarget.scale.setScalar(1.3);
353
+ dragStartMouse.copy(mouse);
354
+ dragStartDistance = distanceFactor;
355
+ canvas.style.cursor = 'grabbing';
356
+ }
357
+ });
358
+
359
+ canvas.addEventListener('mousemove', (e) => {
360
+ const rect = canvas.getBoundingClientRect();
361
+ mouse.x = ((e.clientX - rect.left) / rect.width) * 2 - 1;
362
+ mouse.y = -((e.clientY - rect.top) / rect.height) * 2 + 1;
363
+
364
+ if (isDragging && dragTarget) {
365
+ raycaster.setFromCamera(mouse, camera);
366
+
367
+ if (dragTarget.userData.type === 'azimuth') {
368
+ const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
369
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
370
+ azimuthAngle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
371
+ if (azimuthAngle < 0) azimuthAngle += 360;
372
+ }
373
+ } else if (dragTarget.userData.type === 'elevation') {
374
+ const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), -0.8);
375
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
376
+ const relY = intersection.y - CENTER.y;
377
+ const relZ = intersection.z;
378
+ elevationAngle = THREE.MathUtils.clamp(THREE.MathUtils.radToDeg(Math.atan2(relY, relZ)), -30, 60);
379
+ }
380
+ } else if (dragTarget.userData.type === 'distance') {
381
+ const deltaY = mouse.y - dragStartMouse.y;
382
+ distanceFactor = THREE.MathUtils.clamp(dragStartDistance - deltaY * 1.5, 0.6, 1.4);
383
+ }
384
+ updatePositions();
385
+ } else {
386
+ raycaster.setFromCamera(mouse, camera);
387
+ const intersects = raycaster.intersectObjects([azimuthHandle, elevationHandle, distanceHandle]);
388
+ [azimuthHandle, elevationHandle, distanceHandle].forEach(h => {
389
+ h.material.emissiveIntensity = 0.5;
390
+ h.scale.setScalar(1);
391
+ });
392
+ if (intersects.length > 0) {
393
+ intersects[0].object.material.emissiveIntensity = 0.8;
394
+ intersects[0].object.scale.setScalar(1.1);
395
+ canvas.style.cursor = 'grab';
396
+ } else {
397
+ canvas.style.cursor = 'default';
398
+ }
399
+ }
400
+ });
401
+
402
+ const onMouseUp = () => {
403
+ if (dragTarget) {
404
+ dragTarget.material.emissiveIntensity = 0.5;
405
+ dragTarget.scale.setScalar(1);
406
+
407
+ // Snap and animate
408
+ const targetAz = snapToNearest(azimuthAngle, azimuthSteps);
409
+ const targetEl = snapToNearest(elevationAngle, elevationSteps);
410
+ const targetDist = snapToNearest(distanceFactor, distanceSteps);
411
+
412
+ const startAz = azimuthAngle, startEl = elevationAngle, startDist = distanceFactor;
413
+ const startTime = Date.now();
414
+
415
+ function animateSnap() {
416
+ const t = Math.min((Date.now() - startTime) / 200, 1);
417
+ const ease = 1 - Math.pow(1 - t, 3);
418
+
419
+ let azDiff = targetAz - startAz;
420
+ if (azDiff > 180) azDiff -= 360;
421
+ if (azDiff < -180) azDiff += 360;
422
+ azimuthAngle = startAz + azDiff * ease;
423
+ if (azimuthAngle < 0) azimuthAngle += 360;
424
+ if (azimuthAngle >= 360) azimuthAngle -= 360;
425
+
426
+ elevationAngle = startEl + (targetEl - startEl) * ease;
427
+ distanceFactor = startDist + (targetDist - startDist) * ease;
428
+
429
+ updatePositions();
430
+ if (t < 1) requestAnimationFrame(animateSnap);
431
+ else updatePropsAndTrigger();
432
+ }
433
+ animateSnap();
434
+ }
435
+ isDragging = false;
436
+ dragTarget = null;
437
+ canvas.style.cursor = 'default';
438
+ };
439
+
440
+ canvas.addEventListener('mouseup', onMouseUp);
441
+ canvas.addEventListener('mouseleave', onMouseUp);
442
+
443
+ // Touch support for mobile
444
+ canvas.addEventListener('touchstart', (e) => {
445
+ e.preventDefault();
446
+ const touch = e.touches[0];
447
+ const rect = canvas.getBoundingClientRect();
448
+ mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
449
+ mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
450
+
451
+ raycaster.setFromCamera(mouse, camera);
452
+ const intersects = raycaster.intersectObjects([azimuthHandle, elevationHandle, distanceHandle]);
453
+
454
+ if (intersects.length > 0) {
455
+ isDragging = true;
456
+ dragTarget = intersects[0].object;
457
+ dragTarget.material.emissiveIntensity = 1.0;
458
+ dragTarget.scale.setScalar(1.3);
459
+ dragStartMouse.copy(mouse);
460
+ dragStartDistance = distanceFactor;
461
+ }
462
+ }, { passive: false });
463
+
464
+ canvas.addEventListener('touchmove', (e) => {
465
+ e.preventDefault();
466
+ const touch = e.touches[0];
467
+ const rect = canvas.getBoundingClientRect();
468
+ mouse.x = ((touch.clientX - rect.left) / rect.width) * 2 - 1;
469
+ mouse.y = -((touch.clientY - rect.top) / rect.height) * 2 + 1;
470
+
471
+ if (isDragging && dragTarget) {
472
+ raycaster.setFromCamera(mouse, camera);
473
+
474
+ if (dragTarget.userData.type === 'azimuth') {
475
+ const plane = new THREE.Plane(new THREE.Vector3(0, 1, 0), -0.05);
476
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
477
+ azimuthAngle = THREE.MathUtils.radToDeg(Math.atan2(intersection.x, intersection.z));
478
+ if (azimuthAngle < 0) azimuthAngle += 360;
479
+ }
480
+ } else if (dragTarget.userData.type === 'elevation') {
481
+ const plane = new THREE.Plane(new THREE.Vector3(1, 0, 0), -0.8);
482
+ if (raycaster.ray.intersectPlane(plane, intersection)) {
483
+ const relY = intersection.y - CENTER.y;
484
+ const relZ = intersection.z;
485
+ elevationAngle = THREE.MathUtils.clamp(THREE.MathUtils.radToDeg(Math.atan2(relY, relZ)), -30, 60);
486
+ }
487
+ } else if (dragTarget.userData.type === 'distance') {
488
+ const deltaY = mouse.y - dragStartMouse.y;
489
+ distanceFactor = THREE.MathUtils.clamp(dragStartDistance - deltaY * 1.5, 0.6, 1.4);
490
+ }
491
+ updatePositions();
492
+ }
493
+ }, { passive: false });
494
+
495
+ canvas.addEventListener('touchend', (e) => {
496
+ e.preventDefault();
497
+ onMouseUp();
498
+ }, { passive: false });
499
+
500
+ canvas.addEventListener('touchcancel', (e) => {
501
+ e.preventDefault();
502
+ onMouseUp();
503
+ }, { passive: false });
504
+
505
+ // Initial update
506
+ updatePositions();
507
+
508
+ // Render loop
509
+ function render() {
510
+ requestAnimationFrame(render);
511
+ renderer.render(scene, camera);
512
+ }
513
+ render();
514
+
515
+ // Handle resize
516
+ new ResizeObserver(() => {
517
+ camera.aspect = wrapper.clientWidth / wrapper.clientHeight;
518
+ camera.updateProjectionMatrix();
519
+ renderer.setSize(wrapper.clientWidth, wrapper.clientHeight);
520
+ }).observe(wrapper);
521
+
522
+ // Store update functions for external calls
523
+ wrapper._updateFromProps = (newVal) => {
524
+ if (newVal && typeof newVal === 'object') {
525
+ azimuthAngle = newVal.azimuth ?? azimuthAngle;
526
+ elevationAngle = newVal.elevation ?? elevationAngle;
527
+ distanceFactor = newVal.distance ?? distanceFactor;
528
+ updatePositions();
529
+ }
530
+ };
531
+
532
+ wrapper._updateTexture = updateTextureFromUrl;
533
+
534
+ // Watch for prop changes (imageUrl and value)
535
+ let lastImageUrl = props.imageUrl;
536
+ let lastValue = JSON.stringify(props.value);
537
+ setInterval(() => {
538
+ // Check imageUrl changes
539
+ if (props.imageUrl !== lastImageUrl) {
540
+ lastImageUrl = props.imageUrl;
541
+ updateTextureFromUrl(props.imageUrl);
542
+ }
543
+ // Check value changes (from sliders)
544
+ const currentValue = JSON.stringify(props.value);
545
+ if (currentValue !== lastValue) {
546
+ lastValue = currentValue;
547
+ if (props.value && typeof props.value === 'object') {
548
+ azimuthAngle = props.value.azimuth ?? azimuthAngle;
549
+ elevationAngle = props.value.elevation ?? elevationAngle;
550
+ distanceFactor = props.value.distance ?? distanceFactor;
551
+ updatePositions();
552
+ }
553
+ }
554
+ }, 100);
555
+ };
556
+
557
+ initScene();
558
+ })();
559
+ """
560
+
561
+ super().__init__(
562
+ value=value,
563
+ html_template=html_template,
564
+ js_on_load=js_on_load,
565
+ imageUrl=imageUrl,
566
+ **kwargs
567
+ )
568
+ from typing import Callable, Literal, Sequence, Any, TYPE_CHECKING
569
+ from gradio.blocks import Block
570
+ if TYPE_CHECKING:
571
+ from gradio.components import Timer
572
+ from gradio.components.base import Component
examples/1.jpg ADDED
examples/10.jpeg ADDED
examples/11.jpg ADDED
examples/12.jpg ADDED
examples/13.jpg ADDED

Git LFS Details

  • SHA256: d54e023ee72ab14ca3180c3f0c1707234845cc4886adcbc7aa3039914ed4759e
  • Pointer size: 131 Bytes
  • Size of remote file: 197 kB
examples/14.jpg ADDED
examples/2.jpeg ADDED
examples/4.jpg ADDED
examples/5.jpg ADDED
examples/6.jpg ADDED
examples/7.jpg ADDED
examples/8.jpg ADDED
examples/9.jpg ADDED
examples/ELS.jpg ADDED
pre-requirements.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ pip>=23.0.0
qwenimage/__init__.py ADDED
File without changes
qwenimage/pipeline_qwenimage_edit_plus.py ADDED
@@ -0,0 +1,900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import math
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torch.nn.functional as F
22
+ from PIL import Image, ImageOps
23
+ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, Qwen2VLProcessor
24
+
25
+ from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
26
+ from diffusers.loaders import QwenImageLoraLoaderMixin
27
+ from diffusers.models import AutoencoderKLQwenImage, QwenImageTransformer2DModel
28
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
29
+ from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
30
+ from diffusers.utils.torch_utils import randn_tensor
31
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
32
+ from diffusers.pipelines.qwenimage.pipeline_output import QwenImagePipelineOutput
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
42
+
43
+ EXAMPLE_DOC_STRING = """
44
+ Examples:
45
+ ```py
46
+ >>> import torch
47
+ >>> from diffusers import QwenImageEditPlusPipeline
48
+ >>> from diffusers.utils import load_image
49
+
50
+ >>> pipe = QwenImageEditPlusPipeline.from_pretrained(
51
+ ... "Qwen/Qwen-Image-Edit-2509", torch_dtype=torch.bfloat16
52
+ ... ).to("cuda")
53
+
54
+ >>> image = load_image(
55
+ ... "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/yarn-art-pikachu.png"
56
+ ... ).convert("RGB")
57
+
58
+ >>> prompt = "Make Pikachu hold a sign that says 'Qwen Edit is awesome', yarn art style, detailed, vibrant colors"
59
+
60
+ >>> out = pipe(image=image, prompt=prompt, num_inference_steps=50).images[0]
61
+ >>> out.save("qwenimage_edit_plus.png")
62
+ ```
63
+ """
64
+
65
+ CONDITION_IMAGE_SIZE = 384 * 384
66
+ VAE_IMAGE_SIZE = 1024 * 1024
67
+
68
+
69
def pad_to_aspect(img: Image.Image, target_w: int, target_h: int) -> Image.Image:
    """Letterbox *img* onto a (target_w, target_h) canvas without distortion.

    The image is converted to RGB, scaled to fit inside the target box using
    LANCZOS resampling, and centered on a black background.
    """
    rgb = img.convert("RGB")
    box = (int(target_w), int(target_h))
    return ImageOps.pad(
        rgb,
        box,
        method=Image.Resampling.LANCZOS,
        color=(0, 0, 0),
        centering=(0.5, 0.5),
    )
78
+
79
+
80
def choose_condition_area(canvas_area: int, base_area: int = CONDITION_IMAGE_SIZE) -> int:
    """Pick a conditioning target area proportional to the canvas area.

    The canvas area is scaled by ``base_area / 1024**2`` and the result is
    clamped to the inclusive range [256*256, base_area].
    """
    proportional = int(canvas_area * (base_area / (1024 * 1024)))
    lower_bounded = max(256 * 256, proportional)
    return int(min(base_area, lower_bounded))
84
+
85
+
86
+ # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.calculate_shift
87
def calculate_shift(
    image_seq_len,
    base_seq_len: int = 256,
    max_seq_len: int = 4096,
    base_shift: float = 0.5,
    max_shift: float = 1.15,
):
    """Linearly interpolate the scheduler shift ``mu`` from the sequence length.

    Maps ``base_seq_len`` -> ``base_shift`` and ``max_seq_len`` -> ``max_shift``;
    lengths outside that range extrapolate along the same line.
    """
    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
    intercept = base_shift - slope * base_seq_len
    return image_seq_len * slope + intercept
98
+
99
+
100
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
101
def retrieve_timesteps(
    scheduler,
    num_inference_steps: Optional[int] = None,
    device: Optional[Union[str, torch.device]] = None,
    timesteps: Optional[List[int]] = None,
    sigmas: Optional[List[float]] = None,
    **kwargs,
):
    """Call ``scheduler.set_timesteps`` and return the resulting schedule.

    At most one of ``timesteps`` or ``sigmas`` may be supplied; either one
    overrides ``num_inference_steps``.  Raises ``ValueError`` if both are
    given, or if the scheduler's ``set_timesteps`` does not accept the
    supplied keyword.

    Returns:
        ``(timesteps, num_inference_steps)`` — the scheduler's timestep
        tensor and its length.
    """
    if timesteps is not None and sigmas is not None:
        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one.")

    if timesteps is not None:
        # Only schedulers whose set_timesteps signature exposes `timesteps`
        # support a fully custom schedule.
        accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accepts_timesteps:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom timesteps."
            )
        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)

    elif sigmas is not None:
        accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
        if not accept_sigmas:
            raise ValueError(
                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom sigmas."
            )
        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
        timesteps = scheduler.timesteps
        num_inference_steps = len(timesteps)

    else:
        # Default path: let the scheduler build its own schedule for the
        # requested number of steps.
        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
        timesteps = scheduler.timesteps

    return timesteps, num_inference_steps
137
+
138
+
139
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
140
def retrieve_latents(encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"):
    """Extract latents from a VAE encoder output.

    Supports outputs exposing a ``latent_dist`` (sampled stochastically for
    ``sample_mode="sample"`` or deterministically via the distribution mode
    for ``"argmax"``) as well as outputs exposing raw ``latents``.
    """
    if hasattr(encoder_output, "latent_dist"):
        if sample_mode == "sample":
            return encoder_output.latent_dist.sample(generator)
        if sample_mode == "argmax":
            return encoder_output.latent_dist.mode()
    if hasattr(encoder_output, "latents"):
        return encoder_output.latents
    raise AttributeError("Could not access latents of provided encoder_output")
148
+
149
+
150
def calculate_dimensions(target_area: int, ratio: float, multiple: int = 32):
    """
    Compute a (width, height) pair holding roughly ``target_area`` pixels at
    aspect ``ratio``, snapping both sides to the nearest lattice ``multiple``.
    Used for canvas sizing AND conditioning sizing (anti-drift).
    """
    # A falsy multiple (0/None) falls back to the default lattice of 32.
    step = max(1, int(multiple) if multiple else 32)

    raw_w = math.sqrt(float(target_area) * float(ratio))
    raw_h = raw_w / float(ratio)

    snapped_w = round(raw_w / step) * step
    snapped_h = round(raw_h / step) * step
    return int(snapped_w), int(snapped_h)
164
+
165
+
166
+ # Optional: decoder VAE (Wan2x)
167
+ _ALT_VAE_WAN2X = None
168
+
169
+ # Track desired tiling state for the optional decoder VAE, so it stays consistent across lazy loads.
170
+ _ALT_VAE_WAN2X_TILING_ENABLED = False
171
+
172
+
173
+ def _set_vae_tiling(model: Any, enabled: bool) -> bool:
174
+ """
175
+ Best-effort tiling toggle for a VAE-like module.
176
+ Returns True if a tiling method existed and was called, False otherwise.
177
+ """
178
+ if model is None:
179
+ return False
180
+ try:
181
+ if enabled:
182
+ if hasattr(model, "enable_tiling"):
183
+ model.enable_tiling()
184
+ return True
185
+ if hasattr(model, "enable_vae_tiling"):
186
+ model.enable_vae_tiling()
187
+ return True
188
+ else:
189
+ if hasattr(model, "disable_tiling"):
190
+ model.disable_tiling()
191
+ return True
192
+ if hasattr(model, "disable_vae_tiling"):
193
+ model.disable_vae_tiling()
194
+ return True
195
+ except Exception as e:
196
+ # Don't hard-fail inference if tiling toggle fails for an alt decoder.
197
+ logger.warning(f"VAE tiling toggle failed on {type(model)}: {e}")
198
+ return False
199
+ return False
200
+
201
+
202
+
203
def _get_wan2x_vae(device: torch.device, dtype: torch.dtype):
    """
    Decoder-only finetune that outputs 2x resolution via pixel-shuffle.
    Lazy-loaded so it doesn't impact startup unless used.
    """
    global _ALT_VAE_WAN2X, _ALT_VAE_WAN2X_TILING_ENABLED
    if _ALT_VAE_WAN2X is None:
        # Local import: diffusers' Wan VAE is only needed when this decoder
        # is actually selected.
        from diffusers import AutoencoderKLWan

        _ALT_VAE_WAN2X = AutoencoderKLWan.from_pretrained(
            "spacepxl/Wan2.1-VAE-upscale2x",
            subfolder="diffusers/Wan2.1_VAE_upscale2x_imageonly_real_v1",
            torch_dtype=dtype,
        )
        _ALT_VAE_WAN2X.eval()

        # Apply last requested tiling immediately on first load (if supported).
        _set_vae_tiling(_ALT_VAE_WAN2X, _ALT_VAE_WAN2X_TILING_ENABLED)

    # NOTE(review): .to() appears to run on every call, so a cached instance
    # tracks the most recently requested device/dtype — confirm intended.
    _ALT_VAE_WAN2X = _ALT_VAE_WAN2X.to(device=device, dtype=dtype)

    # Re-apply after moving to device, just in case.
    _set_vae_tiling(_ALT_VAE_WAN2X, _ALT_VAE_WAN2X_TILING_ENABLED)

    return _ALT_VAE_WAN2X
228
+
229
+
230
+ class QwenImageEditPlusPipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
231
+ r"""
232
+ The Qwen-Image-Edit pipeline for image editing.
233
+ """
234
+
235
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
236
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
237
+
238
def __init__(
    self,
    scheduler: FlowMatchEulerDiscreteScheduler,
    vae: AutoencoderKLQwenImage,
    text_encoder: Qwen2_5_VLForConditionalGeneration,
    tokenizer: Qwen2Tokenizer,
    processor: Qwen2VLProcessor,
    transformer: QwenImageTransformer2DModel,
):
    """Register the pipeline's sub-models and derive sizing constants."""
    super().__init__()
    self.register_modules(
        vae=vae,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        processor=processor,
        transformer=transformer,
        scheduler=scheduler,
    )

    # Spatial downsampling = 2 ** number of temporal-downsample stages; the
    # fallbacks (8 / 16) cover a pipeline constructed without a VAE.
    self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
    self.latent_channels = self.vae.config.z_dim if getattr(self, "vae", None) else 16

    # QwenImage latents are turned into 2x2 patches and packed; multiply scale-factor by patch size
    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
    self.tokenizer_max_length = 1024

    # Track tiling state (applies to both primary VAE and optional decoder VAE)
    self._vae_tiling_enabled = False

    # Chat-style template wrapped around the user's edit instruction before it
    # is fed to the Qwen2.5-VL text encoder.
    self.prompt_template_encode = (
        "<|im_start|>system\n"
        "Describe the key features of the input image (color, shape, size, texture, objects, background), "
        "then explain how the user's text instruction should alter or modify the image.\n"
        "Generate a new image that meets the user's requirements while maintaining consistency with the original input where appropriate."
        "<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
    )
    # Number of leading template tokens dropped from the encoder hidden states
    # in _get_qwen_prompt_embeds.
    self.prompt_template_encode_start_idx = 64
    self.default_sample_size = 128
277
+
278
+ # ------------------------------------------------------------
279
+ # VAE tiling control (applies to both primary VAE and optional decoder VAE)
280
+ # ------------------------------------------------------------
281
+ # Expose a stable API so app.py can call pipe.enable_vae_tiling()/disable_vae_tiling()
282
+ # regardless of which decoder VAE is selected at runtime.
283
+
284
def set_vae_tiling(self, enabled: bool) -> None:
    """Enable/disable tiling on the primary VAE and the optional Wan2x decoder.

    The desired state is also stored in the module-level flag so a
    lazily-loaded Wan2x decoder picks it up on first load.
    """
    global _ALT_VAE_WAN2X_TILING_ENABLED, _ALT_VAE_WAN2X

    enabled = bool(enabled)
    self._vae_tiling_enabled = enabled

    # 1) Primary VAE (Qwen)
    _set_vae_tiling(getattr(self, "vae", None), enabled)

    # 2) Optional decoder VAE (Wan2x): store desired global state; apply now if already loaded.
    _ALT_VAE_WAN2X_TILING_ENABLED = enabled
    if _ALT_VAE_WAN2X is not None:
        _set_vae_tiling(_ALT_VAE_WAN2X, enabled)
297
+
298
def enable_vae_tiling(self) -> None:
    """Convenience wrapper for ``set_vae_tiling(True)``."""
    self.set_vae_tiling(True)
300
+
301
def disable_vae_tiling(self) -> None:
    """Convenience wrapper for ``set_vae_tiling(False)``."""
    self.set_vae_tiling(False)
303
+ # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage.QwenImagePipeline._extract_masked_hidden
304
def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor):
    """Split padded hidden states back into per-sample unpadded sequences.

    Rows where ``mask`` is truthy are gathered from ``hidden_states`` and
    regrouped per batch element, so each returned tensor holds only that
    sample's valid positions.
    """
    keep = mask.bool()
    per_sample_counts = keep.sum(dim=1).tolist()
    flat_valid = hidden_states[keep]
    return torch.split(flat_valid, per_sample_counts, dim=0)
310
+
311
def _get_qwen_prompt_embeds(
    self,
    prompt: Union[str, List[str]] = None,
    image: Optional[torch.Tensor] = None,
    device: Optional[torch.device] = None,
    dtype: Optional[torch.dtype] = None,
):
    """Encode prompt (plus optional reference image(s)) with Qwen2.5-VL.

    Returns:
        ``(prompt_embeds, encoder_attention_mask)``: last-layer hidden
        states with the fixed template prefix dropped and re-padded to the
        longest sequence in the batch, plus the matching 0/1 mask.
    """
    device = device or self._execution_device
    dtype = dtype or self.text_encoder.dtype

    prompt = [prompt] if isinstance(prompt, str) else prompt
    img_prompt_template = "Picture {}: <|vision_start|><|image_pad|><|vision_end|>"

    # One "Picture N:" vision placeholder per conditioning image.
    if isinstance(image, list):
        base_img_prompt = ""
        for i, _ in enumerate(image):
            base_img_prompt += img_prompt_template.format(i + 1)
    elif image is not None:
        base_img_prompt = img_prompt_template.format(1)
    else:
        base_img_prompt = ""

    template = self.prompt_template_encode
    drop_idx = self.prompt_template_encode_start_idx
    txt = [template.format(base_img_prompt + e) for e in prompt]

    model_inputs = self.processor(text=txt, images=image, padding=True, return_tensors="pt").to(device)

    outputs = self.text_encoder(
        input_ids=model_inputs.input_ids,
        attention_mask=model_inputs.attention_mask,
        pixel_values=model_inputs.pixel_values,
        image_grid_thw=model_inputs.image_grid_thw,
        output_hidden_states=True,
    )

    # Strip padding per sample, then drop the fixed-length template prefix.
    hidden_states = outputs.hidden_states[-1]
    split_hidden_states = self._extract_masked_hidden(hidden_states, model_inputs.attention_mask)
    split_hidden_states = [e[drop_idx:] for e in split_hidden_states]

    # Re-pad every sample with zeros up to the longest remaining sequence.
    attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
    max_seq_len = max([e.size(0) for e in split_hidden_states])

    prompt_embeds = torch.stack(
        [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
    )
    encoder_attention_mask = torch.stack(
        [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
    )

    prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
    return prompt_embeds, encoder_attention_mask
363
+
364
+ # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.encode_prompt
365
def encode_prompt(
    self,
    prompt: Union[str, List[str]],
    image: Optional[torch.Tensor] = None,
    device: Optional[torch.device] = None,
    num_images_per_prompt: int = 1,
    prompt_embeds: Optional[torch.Tensor] = None,
    prompt_embeds_mask: Optional[torch.Tensor] = None,
    max_sequence_length: int = 1024,
):
    """Compute (or reuse) prompt embeddings and duplicate them per image.

    If ``prompt_embeds`` is supplied it is used as-is; otherwise embeddings
    come from :meth:`_get_qwen_prompt_embeds`.  Embeddings and mask are then
    repeated ``num_images_per_prompt`` times along the batch dimension.

    Note: ``max_sequence_length`` is accepted for API parity with the other
    Qwen-Image pipelines but is not consumed in this method body.
    """
    device = device or self._execution_device
    prompt = [prompt] if isinstance(prompt, str) else prompt
    batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]

    if prompt_embeds is None:
        prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, image, device)

    _, seq_len, _ = prompt_embeds.shape

    # repeat along dim=1 then reshape: equivalent to repeating each sample
    # num_images_per_prompt times in the batch dimension.
    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

    prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
    prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)

    return prompt_embeds, prompt_embeds_mask
391
+
392
+ # Copied from diffusers.pipelines.qwenimage.pipeline_qwenimage_edit.QwenImageEditPipeline.check_inputs
393
def check_inputs(
    self,
    prompt,
    height,
    width,
    negative_prompt=None,
    prompt_embeds=None,
    negative_prompt_embeds=None,
    prompt_embeds_mask=None,
    negative_prompt_embeds_mask=None,
    callback_on_step_end_tensor_inputs=None,
    max_sequence_length=None,
):
    """Validate user-facing call arguments, raising ValueError on bad combinations."""
    divisor = self.vae_scale_factor * 2
    if height % divisor != 0 or width % divisor != 0:
        # Non-fatal: the pipeline resizes, so only warn.
        logger.warning(
            f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. "
            "Dimensions will be resized accordingly."
        )

    if callback_on_step_end_tensor_inputs is not None:
        unknown = [k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]
        if unknown:
            raise ValueError(
                f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found "
                f"{unknown}"
            )

    # Exactly one of prompt / prompt_embeds must be provided.
    if prompt is not None and prompt_embeds is not None:
        raise ValueError("Cannot forward both `prompt` and `prompt_embeds`.")
    if prompt is None and prompt_embeds is None:
        raise ValueError("Provide either `prompt` or `prompt_embeds`.")
    if prompt is not None and not isinstance(prompt, (str, list)):
        raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")

    if negative_prompt is not None and negative_prompt_embeds is not None:
        raise ValueError("Cannot forward both `negative_prompt` and `negative_prompt_embeds`.")

    # Pre-computed embeddings must be accompanied by their attention masks.
    if prompt_embeds is not None and prompt_embeds_mask is None:
        raise ValueError("If `prompt_embeds` are provided, `prompt_embeds_mask` must also be passed.")

    if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
        raise ValueError("If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` must also be passed.")

    if max_sequence_length is not None and max_sequence_length > 1024:
        raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}")
438
+
439
@staticmethod
def _pack_latents(latents, batch_size, num_channels_latents, height, width):
    """Fold (B, C, H, W) latents into packed 2x2 patch tokens of shape
    (B, H/2 * W/2, C*4)."""
    patched = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
    patched = patched.permute(0, 2, 4, 1, 3, 5)
    return patched.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
445
+
446
@staticmethod
def _unpack_latents(latents, height, width, vae_scale_factor):
    """Inverse of ``_pack_latents``: tokens (B, H/2*W/2, C*4) back to
    (B, C, 1, lat_h, lat_w), with a singleton frame axis for the video VAE."""
    batch_size, _, channels = latents.shape
    # Latent-space spatial size derived from the pixel-space target, forced even.
    lat_h = 2 * (int(height) // (vae_scale_factor * 2))
    lat_w = 2 * (int(width) // (vae_scale_factor * 2))
    tokens = latents.view(batch_size, lat_h // 2, lat_w // 2, channels // 4, 2, 2)
    tokens = tokens.permute(0, 3, 1, 4, 2, 5)
    return tokens.reshape(batch_size, channels // 4, 1, lat_h, lat_w)
455
+
456
def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
    """Encode pixel-space images to normalized VAE latents.

    Uses deterministic ("argmax") encoding, then standardizes with the VAE's
    configured per-channel ``latents_mean`` / ``latents_std``.
    """
    if isinstance(generator, list):
        # One generator per batch element: encode sample-by-sample.
        image_latents = [
            retrieve_latents(self.vae.encode(image[i : i + 1]), generator=generator[i], sample_mode="argmax")
            for i in range(image.shape[0])
        ]
        image_latents = torch.cat(image_latents, dim=0)
    else:
        image_latents = retrieve_latents(self.vae.encode(image), generator=generator, sample_mode="argmax")

    latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, self.latent_channels, 1, 1, 1).to(
        image_latents.device, image_latents.dtype
    )
    latents_std = torch.tensor(self.vae.config.latents_std).view(1, self.latent_channels, 1, 1, 1).to(
        image_latents.device, image_latents.dtype
    )
    image_latents = (image_latents - latents_mean) / latents_std
    return image_latents
474
+
475
def prepare_latents(
    self,
    images,
    batch_size,
    num_channels_latents,
    height,
    width,
    dtype,
    device,
    generator,
    latents=None,
):
    """Create initial noise latents and packed conditioning-image latents.

    Returns:
        ``(latents, image_latents)``: packed noise tokens, and — when
        ``images`` is given — per-image latents concatenated along the token
        axis; otherwise ``image_latents`` is None.
    """
    # Latent-space spatial size, forced even for 2x2 patch packing.
    height = 2 * (int(height) // (self.vae_scale_factor * 2))
    width = 2 * (int(width) // (self.vae_scale_factor * 2))
    shape = (batch_size, 1, num_channels_latents, height, width)

    image_latents = None
    if images is not None:
        if not isinstance(images, list):
            images = [images]
        all_image_latents = []

        for image in images:
            image = image.to(device=device, dtype=dtype)
            # Accept raw pixels (encode) or pre-encoded latents (pass through),
            # distinguished by the channel count.
            if image.shape[1] != self.latent_channels:
                image_latents = self._encode_vae_image(image=image, generator=generator)
            else:
                image_latents = image

            if batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] == 0:
                # Tile image latents to match the effective prompt batch.
                additional_image_per_prompt = batch_size // image_latents.shape[0]
                image_latents = torch.cat([image_latents] * additional_image_per_prompt, dim=0)
            elif batch_size > image_latents.shape[0] and batch_size % image_latents.shape[0] != 0:
                raise ValueError(
                    f"Cannot duplicate `image` of batch size {image_latents.shape[0]} to {batch_size} text prompts."
                )

            image_latent_height, image_latent_width = image_latents.shape[3:]
            image_latents = self._pack_latents(
                image_latents, batch_size, num_channels_latents, image_latent_height, image_latent_width
            )
            all_image_latents.append(image_latents)

        # All conditioning images are concatenated along the token axis.
        image_latents = torch.cat(all_image_latents, dim=1)

    if isinstance(generator, list) and len(generator) != batch_size:
        raise ValueError(
            f"You passed a list of generators of length {len(generator)}, but requested an effective batch size of {batch_size}."
        )

    if latents is None:
        latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
        latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
    else:
        # Caller-supplied latents are assumed to be already packed.
        latents = latents.to(device=device, dtype=dtype)

    return latents, image_latents
532
+
533
@property
def guidance_scale(self):
    # Read-only accessor for the internal `_guidance_scale`.
    return self._guidance_scale

@property
def attention_kwargs(self):
    # Read-only accessor for the internal `_attention_kwargs`.
    return self._attention_kwargs

@property
def num_timesteps(self):
    # Read-only accessor for the internal `_num_timesteps`.
    return self._num_timesteps

@property
def current_timestep(self):
    # Read-only accessor for the internal `_current_timestep`.
    return self._current_timestep

@property
def interrupt(self):
    # Read-only accessor for the internal `_interrupt` flag.
    return self._interrupt
552
+
553
    @torch.no_grad()
    @replace_example_docstring(EXAMPLE_DOC_STRING)
    def __call__(
        self,
        image: Optional[PipelineImageInput] = None,
        prompt: Union[str, List[str]] = None,
        negative_prompt: Union[str, List[str]] = None,
        true_cfg_scale: float = 4.0,
        height: Optional[int] = None,
        width: Optional[int] = None,
        condition_area: Optional[int] = None,
        vae_image_indices: Optional[List[int]] = None,
        pad_to_canvas: bool = True,
        # NEW: lattice + VAE reference-size overrides (non-standard extensions of
        # the upstream diffusers pipeline).
        resolution_multiple: Optional[int] = None,
        vae_ref_area: Optional[int] = None,
        vae_ref_start_index: int = 2,
        # Optional: swap the decoding VAE ("qwen" default, "wan2x" alternate 2x decoder).
        decoder_vae: str = "qwen",  # "qwen" | "wan2x"
        keep_decoder_2x: bool = False,
        # standard args
        num_inference_steps: int = 50,
        sigmas: Optional[List[float]] = None,
        guidance_scale: Optional[float] = None,
        num_images_per_prompt: int = 1,
        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
        latents: Optional[torch.Tensor] = None,
        prompt_embeds: Optional[torch.Tensor] = None,
        prompt_embeds_mask: Optional[torch.Tensor] = None,
        negative_prompt_embeds: Optional[torch.Tensor] = None,
        negative_prompt_embeds_mask: Optional[torch.Tensor] = None,
        output_type: Optional[str] = "pil",
        return_dict: bool = True,
        attention_kwargs: Optional[Dict[str, Any]] = None,
        callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
        # NOTE(review): mutable default argument. It is only read, never mutated,
        # in this body, so the shared-list pitfall does not bite — but a tuple
        # default would be safer.
        callback_on_step_end_tensor_inputs: List[str] = ["latents"],
        max_sequence_length: int = 512,
    ):
        """Run Qwen-Image-Edit inference.

        Args:
            image: A single PIL image (or list of them) to edit. Required in this
                Space variant; pre-latent tensors are rejected below.
            prompt / negative_prompt: Edit instruction(s). True CFG is used only
                when ``true_cfg_scale > 1`` and a negative prompt (or its embeds)
                is given.
            height / width: Output size; defaults to a ~1 MP canvas matching the
                first image's aspect ratio, snapped down to ``multiple_of``.
            condition_area: Pixel area for the text-encoder conditioning images;
                when None it is chosen from the canvas area.
            vae_image_indices: Which input images also go through the VAE stream
                (all of them when None).
            pad_to_canvas: Letterbox inputs to the target aspect before resizing.
            resolution_multiple: Overrides the lattice multiple (default
                ``vae_scale_factor * 2``) used for all dimension rounding.
            vae_ref_area / vae_ref_start_index: Optional smaller VAE size applied
                only to reference images with index >= ``vae_ref_start_index``.
            decoder_vae / keep_decoder_2x: ``"wan2x"`` decodes with an alternate
                VAE whose 12-channel output is pixel-shuffled to 2x resolution;
                ``keep_decoder_2x`` keeps that 2x image instead of area-resizing
                back to (height, width).
            guidance_scale: Required iff the transformer is guidance-distilled
                (``config.guidance_embeds``); otherwise ignored with a warning.

        Returns:
            `QwenImagePipelineOutput` when ``return_dict`` else a 1-tuple of images.

        Examples:
        """
        # ---- determine input size (first image drives the default canvas) ----
        if isinstance(image, list):
            image_size = image[0].size
        else:
            image_size = image.size

        # Lattice multiple used throughout (canvas sizing + condition sizing).
        multiple_of = int(resolution_multiple) if resolution_multiple is not None else (self.vae_scale_factor * 2)
        multiple_of = max(1, multiple_of)

        calculated_width, calculated_height = calculate_dimensions(
            1024 * 1024, float(image_size[0]) / float(image_size[1]), multiple=multiple_of
        )
        height = height or calculated_height
        width = width or calculated_width

        # Snap user-supplied sizes down onto the lattice.
        width = (int(width) // multiple_of) * multiple_of
        height = (int(height) // multiple_of) * multiple_of

        # ---- validate ----
        self.check_inputs(
            prompt,
            height,
            width,
            negative_prompt=negative_prompt,
            prompt_embeds=prompt_embeds,
            negative_prompt_embeds=negative_prompt_embeds,
            prompt_embeds_mask=prompt_embeds_mask,
            negative_prompt_embeds_mask=negative_prompt_embeds_mask,
            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
            max_sequence_length=max_sequence_length,
        )

        # Mirror call arguments into the read-only properties exposed on self.
        self._guidance_scale = guidance_scale
        self._attention_kwargs = attention_kwargs
        self._current_timestep = None
        self._interrupt = False

        # ---- call params ----
        if prompt is not None and isinstance(prompt, str):
            batch_size = 1
        elif prompt is not None and isinstance(prompt, list):
            batch_size = len(prompt)
        else:
            batch_size = prompt_embeds.shape[0]

        device = self._execution_device

        # ---- preprocess ----
        condition_images = None
        vae_images = None
        vae_image_sizes: List[tuple[int, int]] = []

        # Support pre-latent tensors check kept for interface compatibility, but the
        # else-branch below rejects them explicitly for this Space.
        if image is not None and not (isinstance(image, torch.Tensor) and image.size(1) == self.latent_channels):
            if not isinstance(image, list):
                image = [image]

            canvas_area = int(width) * int(height)
            cond_area = int(condition_area) if condition_area is not None else choose_condition_area(canvas_area)

            cond_w, cond_h = calculate_dimensions(cond_area, float(width) / float(height), multiple=multiple_of)

            # Optional VAE ref override sizing (applied only to indices >= vae_ref_start_index).
            # Best-effort: any failure falls back to full-canvas sizing.
            ref_w = ref_h = None
            if vae_ref_area is not None:
                try:
                    ref_w, ref_h = calculate_dimensions(
                        int(vae_ref_area),
                        float(width) / float(height),
                        multiple=multiple_of,
                    )
                except Exception:
                    ref_w = ref_h = None

            condition_images = []
            vae_images = []

            if vae_image_indices is None:
                vae_image_indices = list(range(len(image)))
            vae_set = set(int(i) for i in vae_image_indices)

            for idx, img in enumerate(image):
                pil = img.convert("RGB") if isinstance(img, Image.Image) else img

                if pad_to_canvas and isinstance(pil, Image.Image):
                    pil = pad_to_aspect(pil, int(width), int(height))

                # Conditioning stream (always): low-res copy fed to the text encoder.
                condition_images.append(self.image_processor.resize(pil, cond_h, cond_w))

                # VAE stream (selective): only images listed in vae_image_indices.
                if idx in vae_set:
                    if (ref_w is not None) and (ref_h is not None) and (int(idx) >= int(vae_ref_start_index)):
                        vw, vh = int(ref_w), int(ref_h)
                    else:
                        vw, vh = int(width), int(height)

                    vae_image_sizes.append((vw, vh))
                    # unsqueeze(2) adds the frame axis expected by the video-style VAE.
                    vae_images.append(self.image_processor.preprocess(pil, int(vh), int(vw)).unsqueeze(2))

            has_neg_prompt = negative_prompt is not None or (
                negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
            )
            if true_cfg_scale > 1 and not has_neg_prompt:
                logger.warning(
                    f"true_cfg_scale={true_cfg_scale} but CFG disabled because no negative prompt was provided."
                )
            if true_cfg_scale <= 1 and has_neg_prompt:
                logger.warning("negative_prompt provided but CFG disabled because true_cfg_scale <= 1")

            do_true_cfg = (true_cfg_scale > 1) and has_neg_prompt

            prompt_embeds, prompt_embeds_mask = self.encode_prompt(
                image=condition_images,
                prompt=prompt,
                prompt_embeds=prompt_embeds,
                prompt_embeds_mask=prompt_embeds_mask,
                device=device,
                num_images_per_prompt=num_images_per_prompt,
                max_sequence_length=max_sequence_length,
            )

            if do_true_cfg:
                negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
                    image=condition_images,
                    prompt=negative_prompt,
                    prompt_embeds=negative_prompt_embeds,
                    prompt_embeds_mask=negative_prompt_embeds_mask,
                    device=device,
                    num_images_per_prompt=num_images_per_prompt,
                    max_sequence_length=max_sequence_length,
                )

            # ---- prepare latents ----
            # in_channels // 4 because latents are packed 2x2 spatially downstream.
            num_channels_latents = self.transformer.config.in_channels // 4
            latents, image_latents = self.prepare_latents(
                vae_images,
                batch_size * num_images_per_prompt,
                num_channels_latents,
                height,
                width,
                prompt_embeds.dtype,
                device,
                generator,
                latents,
            )

            # Token-grid shapes for RoPE: target canvas first, then each VAE image.
            img_shapes = [
                [
                    (1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2),
                    *[
                        (1, vae_h // self.vae_scale_factor // 2, vae_w // self.vae_scale_factor // 2)
                        for (vae_w, vae_h) in vae_image_sizes
                    ],
                ]
            ] * batch_size

        else:
            raise ValueError(
                "This Space pipeline expects `image` as PIL/np inputs (not pre-latents) in this setup."
            )

        # ---- timesteps ----
        sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas

        # Dynamic shifting (mu) scales the schedule with the image token count.
        image_seq_len = latents.shape[1]
        mu = calculate_shift(
            image_seq_len,
            self.scheduler.config.get("base_image_seq_len", 256),
            self.scheduler.config.get("max_image_seq_len", 4096),
            self.scheduler.config.get("base_shift", 0.5),
            self.scheduler.config.get("max_shift", 1.15),
        )
        timesteps, num_inference_steps = retrieve_timesteps(
            self.scheduler, num_inference_steps, device, sigmas=sigmas, mu=mu
        )

        num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
        self._num_timesteps = len(timesteps)

        # Guidance-distilled models need explicit guidance input.
        if self.transformer.config.guidance_embeds and guidance_scale is None:
            raise ValueError("guidance_scale is required for guidance-distilled model.")
        if self.transformer.config.guidance_embeds:
            guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32).expand(latents.shape[0])
        else:
            if guidance_scale is not None:
                logger.warning("guidance_scale passed but ignored since model is not guidance-distilled.")
            guidance = None

        if self.attention_kwargs is None:
            self._attention_kwargs = {}

        txt_seq_lens = prompt_embeds_mask.sum(dim=1).tolist() if prompt_embeds_mask is not None else None
        # Rotary embeddings are computed once outside the loop; shapes are static per call.
        image_rotary_emb = self.transformer.pos_embed(img_shapes, txt_seq_lens, device=latents.device)

        # Re-derive do_true_cfg from the embeds actually present (encode_prompt may
        # have been skipped when embeds were passed in directly).
        do_true_cfg = (
            (true_cfg_scale > 1)
            and (negative_prompt_embeds is not None)
            and (negative_prompt_embeds_mask is not None)
        )
        if do_true_cfg:
            negative_txt_seq_lens = negative_prompt_embeds_mask.sum(dim=1).tolist()
            uncond_image_rotary_emb = self.transformer.pos_embed(img_shapes, negative_txt_seq_lens, device=latents.device)
        else:
            uncond_image_rotary_emb = None

        # ---- denoise ----
        self.scheduler.set_begin_index(0)
        with self.progress_bar(total=num_inference_steps) as progress_bar:
            for i, t in enumerate(timesteps):
                # `continue` (not break) keeps the progress bar/scheduler indices consistent.
                if self.interrupt:
                    continue
                self._current_timestep = t

                # Condition latents are concatenated along the sequence axis; the
                # prediction for those positions is sliced off after the transformer.
                latent_model_input = latents
                if image_latents is not None:
                    latent_model_input = torch.cat([latents, image_latents], dim=1)

                timestep = t.expand(latents.shape[0]).to(latents.dtype)

                with self.transformer.cache_context("cond"):
                    noise_pred = self.transformer(
                        hidden_states=latent_model_input,
                        timestep=timestep / 1000,
                        guidance=guidance,
                        encoder_hidden_states_mask=prompt_embeds_mask,
                        encoder_hidden_states=prompt_embeds,
                        image_rotary_emb=image_rotary_emb,
                        attention_kwargs=self.attention_kwargs,
                        return_dict=False,
                    )[0]
                    noise_pred = noise_pred[:, : latents.size(1)]

                if do_true_cfg:
                    with self.transformer.cache_context("uncond"):
                        neg_noise_pred = self.transformer(
                            hidden_states=latent_model_input,
                            timestep=timestep / 1000,
                            guidance=guidance,
                            encoder_hidden_states_mask=negative_prompt_embeds_mask,
                            encoder_hidden_states=negative_prompt_embeds,
                            image_rotary_emb=uncond_image_rotary_emb,
                            attention_kwargs=self.attention_kwargs,
                            return_dict=False,
                        )[0]
                        neg_noise_pred = neg_noise_pred[:, : latents.size(1)]

                    # True CFG with norm rescaling: keep the combined prediction's
                    # magnitude matched to the conditional one (eps avoids div-by-zero).
                    comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
                    cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
                    noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True)
                    noise_pred = comb_pred * (cond_norm / (noise_norm + 1e-8))

                latents_dtype = latents.dtype
                latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
                # MPS backend can silently upcast; restore the working dtype.
                if latents.dtype != latents_dtype and torch.backends.mps.is_available():
                    latents = latents.to(latents_dtype)

                if callback_on_step_end is not None:
                    callback_kwargs = {k: locals()[k] for k in callback_on_step_end_tensor_inputs}
                    callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
                    latents = callback_outputs.pop("latents", latents)
                    prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
                    progress_bar.update()

                if XLA_AVAILABLE:
                    xm.mark_step()

        self._current_timestep = None

        # ---- decode ----
        if output_type == "latent":
            image_out = latents
        else:
            latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
            latents = latents.to(self.vae.dtype)

            # Undo the VAE's normalization before decoding.
            latents_mean = torch.tensor(self.vae.config.latents_mean).view(1, self.vae.config.z_dim, 1, 1, 1).to(
                latents.device, latents.dtype
            )
            latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
                latents.device, latents.dtype
            )
            latents = latents / latents_std + latents_mean

            if decoder_vae == "wan2x":
                # Alternate decoder emits 12 channels; pixel-shuffle by 2 turns
                # them into an RGB image at twice the latent-decoded resolution.
                alt_vae = _get_wan2x_vae(latents.device, self.vae.dtype)
                decoder_out = alt_vae.decode(latents, return_dict=False)[0]  # [B, 12, F, H, W]
                img_2x = F.pixel_shuffle(decoder_out[:, :, 0], upscale_factor=2)  # [B, 3, 2H, 2W]
                if keep_decoder_2x:
                    decoded = img_2x
                else:
                    decoded = F.interpolate(img_2x, size=(int(height), int(width)), mode="area")
            else:
                # [:, :, 0] drops the single frame axis of the video-style VAE output.
                decoded = self.vae.decode(latents, return_dict=False)[0][:, :, 0]

            image_out = self.image_processor.postprocess(decoded, output_type=output_type)

        self.maybe_free_model_hooks()

        if not return_dict:
            return (image_out,)
        return QwenImagePipelineOutput(images=image_out)
qwenimage/qwen_fa3_processor.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Paired with a good language model. Thanks!
3
+ """
4
+
5
+ import torch
6
+ from typing import Optional, Tuple
7
+ from diffusers.models.transformers.transformer_qwenimage import apply_rotary_emb_qwen
8
+
9
# Load the vLLM FlashAttention-3 kernel from the Hugging Face `kernels` hub at
# import time. On any failure (package missing, no network, unsupported platform)
# record the exception and defer the hard error to `_ensure_fa3_available()`, so
# importing this module never crashes on machines without FA3.
try:
    from kernels import get_kernel
    _k = get_kernel("kernels-community/vllm-flash-attn3")
    _flash_attn_func = _k.flash_attn_func
except Exception as e:
    _flash_attn_func = None  # sentinel meaning "FA3 unavailable"
    # NOTE: _kernels_err only exists on the failure path; it is read solely when
    # _flash_attn_func is None, so the success path never touches it.
    _kernels_err = e
16
+
17
+
18
def _ensure_fa3_available():
    """Raise a descriptive ImportError when the FA3 kernel failed to load at import."""
    if _flash_attn_func is not None:
        return
    raise ImportError(
        "FlashAttention-3 via Hugging Face `kernels` is required. "
        "Tried `get_kernel('kernels-community/vllm-flash-attn3')` and failed with:\n"
        f"{_kernels_err}"
    )
25
+
26
# Register the FA3 kernel as a PyTorch custom op so it composes with torch.compile
# and the dispatcher. `mutates_args=()` declares the op functional (no in-place
# writes to its inputs).
@torch.library.custom_op("flash::flash_attn_func", mutates_args=())
def flash_attn_func(
    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal: bool = False
) -> torch.Tensor:
    # The underlying kernel returns (output, softmax_lse); the log-sum-exp is
    # discarded because only the attention output is consumed downstream.
    outputs, lse = _flash_attn_func(q, k, v, causal=causal)
    return outputs
32
+
33
@flash_attn_func.register_fake
def _(q, k, v, **kwargs):
    # Fake (meta) implementation used by torch.compile / fake-tensor tracing: it
    # only has to produce an output with the correct shape/dtype/device, not real
    # values. The raw kernel also returns softmax_lse of shape
    # (batch, num_heads, seq_len) in float32, but the custom op above exposes a
    # single tensor, so only that one is faked here.
    meta_q = torch.empty_like(q).contiguous()
    return meta_q  # , q.new_empty((q.size(0), q.size(2), q.size(1)), dtype=torch.float32)
40
+
41
+
42
class QwenDoubleStreamAttnProcessorFA3:
    """
    FA3-based attention processor for the Qwen double-stream architecture.

    Computes joint attention over the concatenated [text, image] streams using
    vLLM FlashAttention-3 accessed via Hugging Face `kernels`.

    Notes / limitations:
    - General attention masks are not supported on this FA3 path: `is_causal=False`
      and no arbitrary mask (an explicit NotImplementedError guards this).
    - Optional windowed attention / sink tokens / softcap would need extra plumbing.
    - Relies on `apply_rotary_emb_qwen` being importable, same as the non-FA3
      processor.
    """

    # Kept for parity with the other processors' class attribute; not used internally.
    _attention_backend = "fa3"

    def __init__(self):
        # Fail at construction time (not mid-forward) if the kernel never loaded.
        _ensure_fa3_available()

    @torch.no_grad()
    def __call__(
        self,
        attn,  # Attention module with to_q/to_k/to_v/add_*_proj, norms, to_out, to_add_out, and .heads
        hidden_states: torch.FloatTensor,  # (B, S_img, D_model) image stream
        encoder_hidden_states: torch.FloatTensor = None,  # (B, S_txt, D_model) text stream
        encoder_hidden_states_mask: torch.FloatTensor = None,  # unused in FA3 path
        attention_mask: Optional[torch.FloatTensor] = None,  # unused in FA3 path
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # (img_freqs, txt_freqs)
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        """Return (img_attn_out, txt_attn_out) after joint [text, image] attention."""
        if encoder_hidden_states is None:
            raise ValueError("QwenDoubleStreamAttnProcessorFA3 requires encoder_hidden_states (text stream).")
        if attention_mask is not None:
            # FA3 kernel path here does not consume arbitrary masks; fail fast to
            # avoid silent correctness issues.
            raise NotImplementedError("attention_mask is not supported in this FA3 implementation.")

        # Re-check in case the processor object was deserialized without __init__.
        _ensure_fa3_available()

        B, S_img, _ = hidden_states.shape
        S_txt = encoder_hidden_states.shape[1]

        # ---- QKV projections (image/sample stream) ----
        img_q = attn.to_q(hidden_states)  # (B, S_img, D)
        img_k = attn.to_k(hidden_states)
        img_v = attn.to_v(hidden_states)

        # ---- QKV projections (text/context stream) ----
        txt_q = attn.add_q_proj(encoder_hidden_states)  # (B, S_txt, D)
        txt_k = attn.add_k_proj(encoder_hidden_states)
        txt_v = attn.add_v_proj(encoder_hidden_states)

        # ---- Reshape to (B, S, H, D_h) ----
        H = attn.heads
        img_q = img_q.unflatten(-1, (H, -1))
        img_k = img_k.unflatten(-1, (H, -1))
        img_v = img_v.unflatten(-1, (H, -1))

        txt_q = txt_q.unflatten(-1, (H, -1))
        txt_k = txt_k.unflatten(-1, (H, -1))
        txt_v = txt_v.unflatten(-1, (H, -1))

        # ---- Q/K normalization (optional, per the Attention module contract) ----
        if getattr(attn, "norm_q", None) is not None:
            img_q = attn.norm_q(img_q)
        if getattr(attn, "norm_k", None) is not None:
            img_k = attn.norm_k(img_k)
        if getattr(attn, "norm_added_q", None) is not None:
            txt_q = attn.norm_added_q(txt_q)
        if getattr(attn, "norm_added_k", None) is not None:
            txt_k = attn.norm_added_k(txt_k)

        # ---- RoPE (Qwen complex-multiplication variant) ----
        if image_rotary_emb is not None:
            img_freqs, txt_freqs = image_rotary_emb
            # expects tensors shaped (B, S, H, D_h)
            img_q = apply_rotary_emb_qwen(img_q, img_freqs, use_real=False)
            img_k = apply_rotary_emb_qwen(img_k, img_freqs, use_real=False)
            txt_q = apply_rotary_emb_qwen(txt_q, txt_freqs, use_real=False)
            txt_k = apply_rotary_emb_qwen(txt_k, txt_freqs, use_real=False)

        # ---- Joint attention over [text, image] along the sequence axis ----
        # Text tokens come first; the output is split back in the same order below.
        q = torch.cat([txt_q, img_q], dim=1)
        k = torch.cat([txt_k, img_k], dim=1)
        v = torch.cat([txt_v, img_v], dim=1)

        # FlashAttention-3 expects (B, S, H, D_h); the custom op returns only the output.
        out = flash_attn_func(q, k, v, causal=False)  # (B, S_total, H, D_h)

        # ---- Back to (B, S, D_model) ----
        out = out.flatten(2, 3).to(q.dtype)

        # Split back into text / image segments.
        txt_attn_out = out[:, :S_txt, :]
        img_attn_out = out[:, S_txt:, :]

        # ---- Output projections ----
        img_attn_out = attn.to_out[0](img_attn_out)
        if len(attn.to_out) > 1:
            img_attn_out = attn.to_out[1](img_attn_out)  # dropout, if present

        txt_attn_out = attn.to_add_out(txt_attn_out)

        return img_attn_out, txt_attn_out
qwenimage/transformer_qwenimage.py ADDED
@@ -0,0 +1,642 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2025 Qwen-Image Team, The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import functools
16
+ import math
17
+ from typing import Any, Dict, List, Optional, Tuple, Union
18
+
19
+ import torch
20
+ import torch.nn as nn
21
+ import torch.nn.functional as F
22
+
23
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
24
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
25
+ from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
26
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
27
+ from diffusers.models.attention import FeedForward, AttentionMixin
28
+ from diffusers.models.attention_dispatch import dispatch_attention_fn
29
+ from diffusers.models.attention_processor import Attention
30
+ from diffusers.models.cache_utils import CacheMixin
31
+ from diffusers.models.embeddings import TimestepEmbedding, Timesteps
32
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
33
+ from diffusers.models.modeling_utils import ModelMixin
34
+ from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
35
+
36
+
37
# Module-level logger, one per module as is conventional in diffusers.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
38
+
39
+
40
def get_timestep_embedding(
    timesteps: torch.Tensor,
    embedding_dim: int,
    flip_sin_to_cos: bool = False,
    downscale_freq_shift: float = 1,
    scale: float = 1,
    max_period: int = 10000,
) -> torch.Tensor:
    """
    Create sinusoidal timestep embeddings, matching the DDPM reference implementation.

    Args:
        timesteps (torch.Tensor): 1-D tensor of N indices, one per batch element
            (fractional values are allowed).
        embedding_dim (int): dimension of the output embedding.
        flip_sin_to_cos (bool): emit `[cos, sin]` order when True, `[sin, cos]` when False.
        downscale_freq_shift (float): controls the frequency spacing across dimensions.
        scale (float): multiplier applied to the angle before sin/cos.
        max_period (int): controls the minimum frequency of the embeddings.

    Returns:
        torch.Tensor: an [N x embedding_dim] tensor of positional embeddings.
    """
    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"

    half_dim = embedding_dim // 2
    # Geometrically spaced frequencies: exp(-log(max_period) * i / (half_dim - shift)).
    freq_index = torch.arange(start=0, end=half_dim, dtype=torch.float32, device=timesteps.device)
    exponent = -math.log(max_period) * freq_index / (half_dim - downscale_freq_shift)
    frequencies = torch.exp(exponent).to(timesteps.dtype)

    # Outer product of timesteps with frequencies, then optional angle scaling.
    angles = timesteps[:, None].float() * frequencies[None, :]
    angles = scale * angles

    emb = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)

    # Swap the sine and cosine halves when requested.
    if flip_sin_to_cos:
        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)

    # Odd target dimension: zero-pad the last column.
    if embedding_dim % 2 == 1:
        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
    return emb
92
+
93
+
94
def apply_rotary_emb_qwen(
    x: torch.Tensor,
    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
    use_real: bool = True,
    use_real_unbind_dim: int = -1,
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    Apply rotary position embeddings to a query or key tensor.

    Args:
        x (`torch.Tensor`): query or key tensor of shape [B, S, H, D].
        freqs_cis: either a complex frequency tensor (when `use_real=False`) or a
            `(cos, sin)` pair of real tensors shaped [S, D] (when `use_real=True`).
        use_real (bool): select the real cos/sin formulation vs. complex multiplication.
        use_real_unbind_dim (int): -1 for interleaved (flux-style) pairs, -2 for
            split-half (OmniGen-style) pairs; anything else raises ValueError.

    Returns:
        The rotated tensor, same shape and dtype as `x`.
    """
    if not use_real:
        # Complex path: view adjacent channel pairs as complex numbers and rotate
        # by elementwise complex multiplication.
        as_complex = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
        rotated = as_complex * freqs_cis.unsqueeze(1)
        return torch.view_as_real(rotated).flatten(3).type_as(x)

    cos, sin = freqs_cis  # each [S, D]
    cos = cos[None, None].to(x.device)
    sin = sin[None, None].to(x.device)

    if use_real_unbind_dim == -1:
        # Interleaved layout (flux, cogvideox, hunyuan-dit).
        real_part, imag_part = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
        x_rotated = torch.stack([-imag_part, real_part], dim=-1).flatten(3)
    elif use_real_unbind_dim == -2:
        # Split-half layout (Stable Audio, OmniGen, CogView4, Cosmos).
        real_part, imag_part = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
        x_rotated = torch.cat([-imag_part, real_part], dim=-1)
    else:
        raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")

    return (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
140
+
141
+
142
class QwenTimestepProjEmbeddings(nn.Module):
    """Project scalar diffusion timesteps into the transformer's conditioning space.

    A 256-channel sinusoidal projection (cos-first, scaled by 1000) is followed by
    a learned MLP mapping to `embedding_dim`.
    """

    def __init__(self, embedding_dim):
        super().__init__()

        # NOTE: attribute names are part of the checkpoint state-dict layout; do not rename.
        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)

    def forward(self, timestep, hidden_states):
        """Return the (N, D) timestep conditioning, computed in the dtype of `hidden_states`."""
        sinusoidal = self.time_proj(timestep)
        return self.timestep_embedder(sinusoidal.to(dtype=hidden_states.dtype))
156
+
157
+
158
class QwenEmbedRope(nn.Module):
    """Three-axis (frame, height, width) rotary position embedding for Qwen-Image.

    Precomputes complex frequency tables for positive positions (image/video
    tokens and text) and negative positions (used to center height/width when
    `scale_rope` is enabled), each covering 4096 positions per axis.
    """

    def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
        super().__init__()
        self.theta = theta
        self.axes_dim = axes_dim
        pos_index = torch.arange(4096)
        # Negative positions -1, -2, ..., -4096 in descending order.
        neg_index = torch.arange(4096).flip(0) * -1 - 1
        # One frequency table per axis, concatenated along the channel dim.
        self.pos_freqs = torch.cat(
            [
                self.rope_params(pos_index, self.axes_dim[0], self.theta),
                self.rope_params(pos_index, self.axes_dim[1], self.theta),
                self.rope_params(pos_index, self.axes_dim[2], self.theta),
            ],
            dim=1,
        )
        self.neg_freqs = torch.cat(
            [
                self.rope_params(neg_index, self.axes_dim[0], self.theta),
                self.rope_params(neg_index, self.axes_dim[1], self.theta),
                self.rope_params(neg_index, self.axes_dim[2], self.theta),
            ],
            dim=1,
        )
        # Per-(idx, height, width) cache used on the non-compiled path in forward().
        self.rope_cache = {}

        # Deliberately plain attributes, NOT register_buffer: buffers round-trip
        # through state_dict/dtype casts, which drops the imaginary part of these
        # complex tensors.
        self.scale_rope = scale_rope

    def rope_params(self, index, dim, theta=10000):
        """
        Build a complex rotary frequency table for one axis.

        Args:
            index: 1-D tensor of token position indices (may be negative).
            dim: number of channels for this axis (must be even).
            theta: RoPE base frequency.

        Returns:
            Complex tensor of shape [len(index), dim // 2] with unit magnitude.
        """
        assert dim % 2 == 0
        freqs = torch.outer(index, 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)))
        # polar(1, angle) yields e^{i*angle}; magnitude is always 1.
        freqs = torch.polar(torch.ones_like(freqs), freqs)
        return freqs

    def forward(self, video_fhw, txt_seq_lens, device):
        """
        Args:
            video_fhw: (frame, height, width) tuple — or a list of such tuples,
                one per image stream; a nested per-batch list uses only entry 0.
            txt_seq_lens: list of per-sample text lengths; only the max is used.
            device: target device for the returned frequency tensors.

        Returns:
            (vid_freqs, txt_freqs): concatenated video-token frequencies and a
            text-frequency slice offset past the largest video position.
        """
        if self.pos_freqs.device != device:
            self.pos_freqs = self.pos_freqs.to(device)
            self.neg_freqs = self.neg_freqs.to(device)

        # A nested per-batch list collapses to its first entry: all batch items
        # are assumed to share one shape layout.
        if isinstance(video_fhw, list):
            video_fhw = video_fhw[0]
        if not isinstance(video_fhw, list):
            video_fhw = [video_fhw]

        vid_freqs = []
        max_vid_index = 0
        for idx, fhw in enumerate(video_fhw):
            frame, height, width = fhw
            # NOTE(review): cache key omits `frame` — assumes frame count is fixed
            # per stream index; confirm if variable-frame inputs are ever used.
            rope_key = f"{idx}_{height}_{width}"

            # The dict cache is skipped under torch.compile to keep the graph
            # free of Python-side caching.
            if not torch.compiler.is_compiling():
                if rope_key not in self.rope_cache:
                    self.rope_cache[rope_key] = self._compute_video_freqs(frame, height, width, idx)
                video_freq = self.rope_cache[rope_key]
            else:
                video_freq = self._compute_video_freqs(frame, height, width, idx)
            video_freq = video_freq.to(device)
            vid_freqs.append(video_freq)

            if self.scale_rope:
                max_vid_index = max(height // 2, width // 2, max_vid_index)
            else:
                max_vid_index = max(height, width, max_vid_index)

        # Text positions start just past the largest video position so the two
        # modalities never overlap in RoPE space.
        max_len = max(txt_seq_lens)
        txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + max_len, ...]
        vid_freqs = torch.cat(vid_freqs, dim=0)

        return vid_freqs, txt_freqs

    # NOTE(review): lru_cache on an instance method keys on `self` and keeps the
    # module alive for the cache's lifetime (ruff B019); it is also unbounded and
    # duplicates `self.rope_cache`. Left as-is to preserve behavior.
    @functools.lru_cache(maxsize=None)
    def _compute_video_freqs(self, frame, height, width, idx=0):
        """Build the flattened [frame*height*width, C] frequency grid for one stream."""
        seq_lens = frame * height * width
        freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
        freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)

        # Frame axis is offset by the stream index so each image gets distinct positions.
        freqs_frame = freqs_pos[0][idx : idx + frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
        if self.scale_rope:
            # Centered coordinates: negative half then positive half per spatial axis.
            freqs_height = torch.cat([freqs_neg[1][-(height - height // 2) :], freqs_pos[1][: height // 2]], dim=0)
            freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
            freqs_width = torch.cat([freqs_neg[2][-(width - width // 2) :], freqs_pos[2][: width // 2]], dim=0)
            freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
        else:
            freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
            freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)

        freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
        return freqs.clone().contiguous()
254
+
255
+
256
class QwenDoubleStreamAttnProcessor2_0:
    """
    Attention processor for Qwen double-stream architecture, matching DoubleStreamLayerMegatron logic. This processor
    implements joint attention computation where text and image streams are processed together.
    """

    # Backend selector forwarded to dispatch_attention_fn; None selects the default path.
    _attention_backend = None

    def __init__(self):
        # F.scaled_dot_product_attention only exists on PyTorch >= 2.0.
        if not hasattr(F, "scaled_dot_product_attention"):
            raise ImportError(
                "QwenDoubleStreamAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
            )

    def __call__(
        self,
        attn: Attention,
        hidden_states: torch.FloatTensor,  # Image stream
        encoder_hidden_states: torch.FloatTensor = None,  # Text stream
        encoder_hidden_states_mask: torch.FloatTensor = None,  # accepted but not used below
        attention_mask: Optional[torch.FloatTensor] = None,
        image_rotary_emb: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
        """Compute joint attention over concatenated [text, image] tokens.

        Args:
            attn: The `Attention` module providing the projection/norm layers.
            hidden_states: Image-stream tokens `(B, img_seq, dim)`.
            encoder_hidden_states: Text-stream tokens `(B, txt_seq, dim)`; required.
            encoder_hidden_states_mask: Unused here; kept for interface compatibility.
            attention_mask: Optional mask passed straight to the attention kernel.
            image_rotary_emb: Optional `(img_freqs, txt_freqs)` rotary embeddings.

        Returns:
            Tuple `(img_attn_output, txt_attn_output)` after the output projections.

        Raises:
            ValueError: If `encoder_hidden_states` is None.
        """
        if encoder_hidden_states is None:
            raise ValueError("QwenDoubleStreamAttnProcessor2_0 requires encoder_hidden_states (text stream)")

        # Text length is needed later to split the joint output back apart.
        seq_txt = encoder_hidden_states.shape[1]

        # Compute QKV for image stream (sample projections)
        img_query = attn.to_q(hidden_states)
        img_key = attn.to_k(hidden_states)
        img_value = attn.to_v(hidden_states)

        # Compute QKV for text stream (context projections)
        txt_query = attn.add_q_proj(encoder_hidden_states)
        txt_key = attn.add_k_proj(encoder_hidden_states)
        txt_value = attn.add_v_proj(encoder_hidden_states)

        # Reshape for multi-head attention: (B, seq, heads, head_dim)
        img_query = img_query.unflatten(-1, (attn.heads, -1))
        img_key = img_key.unflatten(-1, (attn.heads, -1))
        img_value = img_value.unflatten(-1, (attn.heads, -1))

        txt_query = txt_query.unflatten(-1, (attn.heads, -1))
        txt_key = txt_key.unflatten(-1, (attn.heads, -1))
        txt_value = txt_value.unflatten(-1, (attn.heads, -1))

        # Apply QK normalization (per-stream norms may be absent)
        if attn.norm_q is not None:
            img_query = attn.norm_q(img_query)
        if attn.norm_k is not None:
            img_key = attn.norm_k(img_key)
        if attn.norm_added_q is not None:
            txt_query = attn.norm_added_q(txt_query)
        if attn.norm_added_k is not None:
            txt_key = attn.norm_added_k(txt_key)

        # Apply RoPE separately per stream with its own frequency table
        if image_rotary_emb is not None:
            img_freqs, txt_freqs = image_rotary_emb
            img_query = apply_rotary_emb_qwen(img_query, img_freqs, use_real=False)
            img_key = apply_rotary_emb_qwen(img_key, img_freqs, use_real=False)
            txt_query = apply_rotary_emb_qwen(txt_query, txt_freqs, use_real=False)
            txt_key = apply_rotary_emb_qwen(txt_key, txt_freqs, use_real=False)

        # Concatenate for joint attention
        # Order: [text, image]
        joint_query = torch.cat([txt_query, img_query], dim=1)
        joint_key = torch.cat([txt_key, img_key], dim=1)
        joint_value = torch.cat([txt_value, img_value], dim=1)

        # Compute joint attention
        joint_hidden_states = dispatch_attention_fn(
            joint_query,
            joint_key,
            joint_value,
            attn_mask=attention_mask,
            dropout_p=0.0,
            is_causal=False,
            backend=self._attention_backend,
        )

        # Reshape back: merge (heads, head_dim) -> dim
        joint_hidden_states = joint_hidden_states.flatten(2, 3)
        joint_hidden_states = joint_hidden_states.to(joint_query.dtype)

        # Split attention outputs back (same [text, image] order as above)
        txt_attn_output = joint_hidden_states[:, :seq_txt, :]  # Text part
        img_attn_output = joint_hidden_states[:, seq_txt:, :]  # Image part

        # Apply output projections
        img_attn_output = attn.to_out[0](img_attn_output)
        if len(attn.to_out) > 1:
            img_attn_output = attn.to_out[1](img_attn_output)  # dropout

        txt_attn_output = attn.to_add_out(txt_attn_output)

        return img_attn_output, txt_attn_output
354
+
355
+
356
@maybe_allow_in_graph
class QwenImageTransformerBlock(nn.Module):
    """Dual-stream (image + text) DiT block with AdaLN-style modulation and joint attention."""

    def __init__(
        self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
    ):
        """
        Args:
            dim: Hidden size of both streams.
            num_attention_heads: Number of attention heads.
            attention_head_dim: Per-head dimension.
            qk_norm: Normalization applied to Q/K projections (default "rms_norm").
            eps: Epsilon for LayerNorm/RMSNorm layers.
        """
        super().__init__()

        self.dim = dim
        self.num_attention_heads = num_attention_heads
        self.attention_head_dim = attention_head_dim

        # Image processing modules
        self.img_mod = nn.Sequential(
            nn.SiLU(),
            nn.Linear(dim, 6 * dim, bias=True),  # For scale, shift, gate for norm1 and norm2
        )
        self.img_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.attn = Attention(
            query_dim=dim,
            cross_attention_dim=None,  # Enable cross attention for joint computation
            added_kv_proj_dim=dim,  # Enable added KV projections for text stream
            dim_head=attention_head_dim,
            heads=num_attention_heads,
            out_dim=dim,
            context_pre_only=False,
            bias=True,
            processor=QwenDoubleStreamAttnProcessor2_0(),
            qk_norm=qk_norm,
            eps=eps,
        )
        self.img_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.img_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

        # Text processing modules
        self.txt_mod = nn.Sequential(
            nn.SiLU(),
            nn.Linear(dim, 6 * dim, bias=True),  # For scale, shift, gate for norm1 and norm2
        )
        self.txt_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        # Text doesn't need separate attention - it's handled by img_attn joint computation
        self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
        self.txt_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")

    def _modulate(self, x, mod_params):
        """Apply modulation to input tensor"""
        # mod_params is (B, 3*dim): split into shift/scale (applied here) and gate
        # (returned for the caller to weight the residual branch).
        shift, scale, gate = mod_params.chunk(3, dim=-1)
        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor,
        encoder_hidden_states_mask: torch.Tensor,
        temb: torch.Tensor,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Run one dual-stream block.

        Args:
            hidden_states: Image-stream tokens `(B, img_seq, dim)`.
            encoder_hidden_states: Text-stream tokens `(B, txt_seq, dim)`.
            encoder_hidden_states_mask: Text mask forwarded to the attention processor.
            temb: Timestep embedding `(B, dim)` driving the modulation MLPs.
            image_rotary_emb: Optional `(img_freqs, txt_freqs)` rotary embeddings.
            joint_attention_kwargs: Extra kwargs forwarded to the attention call.

        Returns:
            Tuple `(encoder_hidden_states, hidden_states)` — note text first.
        """
        # Get modulation parameters for both streams
        img_mod_params = self.img_mod(temb)  # [B, 6*dim]
        txt_mod_params = self.txt_mod(temb)  # [B, 6*dim]

        # Split modulation parameters for norm1 and norm2
        img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]
        txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]

        # Process image stream - norm1 + modulation
        img_normed = self.img_norm1(hidden_states)
        img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)

        # Process text stream - norm1 + modulation
        txt_normed = self.txt_norm1(encoder_hidden_states)
        txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)

        # Use QwenAttnProcessor2_0 for joint attention computation
        # This directly implements the DoubleStreamLayerMegatron logic:
        # 1. Computes QKV for both streams
        # 2. Applies QK normalization and RoPE
        # 3. Concatenates and runs joint attention
        # 4. Splits results back to separate streams
        joint_attention_kwargs = joint_attention_kwargs or {}
        attn_output = self.attn(
            hidden_states=img_modulated,  # Image stream (will be processed as "sample")
            encoder_hidden_states=txt_modulated,  # Text stream (will be processed as "context")
            encoder_hidden_states_mask=encoder_hidden_states_mask,
            image_rotary_emb=image_rotary_emb,
            **joint_attention_kwargs,
        )

        # QwenAttnProcessor2_0 returns (img_output, txt_output) when encoder_hidden_states is provided
        img_attn_output, txt_attn_output = attn_output

        # Apply attention gates and add residual (like in Megatron)
        hidden_states = hidden_states + img_gate1 * img_attn_output
        encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output

        # Process image stream - norm2 + MLP
        img_normed2 = self.img_norm2(hidden_states)
        img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
        img_mlp_output = self.img_mlp(img_modulated2)
        hidden_states = hidden_states + img_gate2 * img_mlp_output

        # Process text stream - norm2 + MLP
        txt_normed2 = self.txt_norm2(encoder_hidden_states)
        txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
        txt_mlp_output = self.txt_mlp(txt_modulated2)
        encoder_hidden_states = encoder_hidden_states + txt_gate2 * txt_mlp_output

        # Clip to prevent overflow for fp16 (65504 is the fp16 max finite value)
        if encoder_hidden_states.dtype == torch.float16:
            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
        if hidden_states.dtype == torch.float16:
            hidden_states = hidden_states.clip(-65504, 65504)

        return encoder_hidden_states, hidden_states
470
+
471
+
472
class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin, AttentionMixin):
    """
    The Transformer model introduced in Qwen.

    Args:
        patch_size (`int`, defaults to `2`):
            Patch size to turn the input data into small patches.
        in_channels (`int`, defaults to `64`):
            The number of channels in the input.
        out_channels (`int`, *optional*, defaults to `None`):
            The number of channels in the output. If not specified, it defaults to `in_channels`.
        num_layers (`int`, defaults to `60`):
            The number of layers of dual stream DiT blocks to use.
        attention_head_dim (`int`, defaults to `128`):
            The number of dimensions to use for each attention head.
        num_attention_heads (`int`, defaults to `24`):
            The number of attention heads to use.
        joint_attention_dim (`int`, defaults to `3584`):
            The number of dimensions to use for the joint attention (embedding/channel dimension of
            `encoder_hidden_states`).
        guidance_embeds (`bool`, defaults to `False`):
            Whether to use guidance embeddings for guidance-distilled variant of the model.
        axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`):
            The dimensions to use for the rotary positional embeddings.
    """

    _supports_gradient_checkpointing = True
    _no_split_modules = ["QwenImageTransformerBlock"]
    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
    _repeated_blocks = ["QwenImageTransformerBlock"]

    @register_to_config
    def __init__(
        self,
        patch_size: int = 2,
        in_channels: int = 64,
        out_channels: Optional[int] = 16,
        num_layers: int = 60,
        attention_head_dim: int = 128,
        num_attention_heads: int = 24,
        joint_attention_dim: int = 3584,
        guidance_embeds: bool = False,  # TODO: this should probably be removed
        axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
    ):
        super().__init__()
        self.out_channels = out_channels or in_channels
        self.inner_dim = num_attention_heads * attention_head_dim

        # Rotary positional embedding over (frame, height, width) axes.
        self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)

        self.time_text_embed = QwenTimestepProjEmbeddings(embedding_dim=self.inner_dim)

        self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)

        # Input projections into the shared inner dimension.
        self.img_in = nn.Linear(in_channels, self.inner_dim)
        self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim)

        self.transformer_blocks = nn.ModuleList(
            [
                QwenImageTransformerBlock(
                    dim=self.inner_dim,
                    num_attention_heads=num_attention_heads,
                    attention_head_dim=attention_head_dim,
                )
                for _ in range(num_layers)
            ]
        )

        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)

        self.gradient_checkpointing = False

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        encoder_hidden_states_mask: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
        guidance: torch.Tensor = None,  # TODO: this should probably be removed
        attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
        """
        The [`QwenTransformer2DModel`] forward method.

        Args:
            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
                Input `hidden_states`.
            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`):
                Mask of the input conditions.
            timestep ( `torch.LongTensor`):
                Used to indicate denoising step.
            attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        if attention_kwargs is not None:
            attention_kwargs = attention_kwargs.copy()
            lora_scale = attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )

        hidden_states = self.img_in(hidden_states)

        timestep = timestep.to(hidden_states.dtype)
        encoder_hidden_states = self.txt_norm(encoder_hidden_states)
        encoder_hidden_states = self.txt_in(encoder_hidden_states)

        if guidance is not None:
            # Scale guidance to match the timestep-embedding magnitude convention.
            guidance = guidance.to(hidden_states.dtype) * 1000

        temb = (
            self.time_text_embed(timestep, hidden_states)
            if guidance is None
            else self.time_text_embed(timestep, guidance, hidden_states)
        )

        for index_block, block in enumerate(self.transformer_blocks):
            if torch.is_grad_enabled() and self.gradient_checkpointing:
                # NOTE(review): the checkpointing path does not forward
                # `attention_kwargs` (the non-checkpointed call below does) — confirm
                # this asymmetry is intended.
                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
                    block,
                    hidden_states,
                    encoder_hidden_states,
                    encoder_hidden_states_mask,
                    temb,
                    image_rotary_emb,
                )

            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    encoder_hidden_states_mask=encoder_hidden_states_mask,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=attention_kwargs,
                )

        # Use only the image part (hidden_states) from the dual-stream blocks
        hidden_states = self.norm_out(hidden_states, temb)
        output = self.proj_out(hidden_states)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ git+https://github.com/huggingface/transformers.git@v4.57.3
2
+ git+https://github.com/huggingface/accelerate.git
3
+ git+https://github.com/huggingface/diffusers.git
4
+ git+https://github.com/huggingface/peft.git
5
+ huggingface_hub
6
+ sentencepiece
7
+ torchvision
8
+ supervision
9
+ kernels
10
+ spaces
11
+ hf_xet
12
+ torch==2.9.1
13
+ numpy
14
+ av
setup_manager.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import subprocess
import sys

# Configuration
WORKSPACE_DIR = "/workspace"  # root of the persistent volume; everything lives below it
VENV_DIR = os.path.join(WORKSPACE_DIR, "venv")
APPS_DIR = os.path.join(WORKSPACE_DIR, "apps")
REPO_DIR = os.path.join(WORKSPACE_DIR, "Qwen-Image-Edit")
# NOTE(review): placeholder token committed in source — prefer reading from the
# environment (e.g. os.environ["HF_TOKEN"]) so a real token is never committed.
HF_TOKEN = "YOUR_HF_TOKEN_HERE"

# Cache and Temp Directories (Strictly on persistent drive)
CACHE_BASE = os.path.join(WORKSPACE_DIR, "cache")
TMP_DIR = os.path.join(WORKSPACE_DIR, "tmp")
PIP_CACHE = os.path.join(CACHE_BASE, "pip")
HF_HOME = os.path.join(CACHE_BASE, "huggingface")
17
+
18
def ensure_dirs():
    """Ensures all necessary persistent directories exist."""
    required = (APPS_DIR, REPO_DIR, CACHE_BASE, TMP_DIR, PIP_CACHE, HF_HOME)
    for path in required:
        if os.path.exists(path):
            continue
        os.makedirs(path)
        print(f"Created directory: {path}")
25
+
26
def run_command(command, cwd=None, env=None):
    """Run *command* in a shell, stream its combined output, and return the exit code."""
    print(f"Running: {command}")

    # Start from the current environment, pin caches/temp to the persistent
    # drive, then layer any caller-supplied overrides on top.
    merged_env = os.environ.copy()
    merged_env["TMPDIR"] = TMP_DIR
    merged_env["PIP_CACHE_DIR"] = PIP_CACHE
    merged_env["HF_HOME"] = HF_HOME
    if env:
        merged_env.update(env)

    proc = subprocess.Popen(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,  # interleave stderr with stdout for live logging
        text=True,
        cwd=cwd,
        env=merged_env,
    )
    # Echo output line-by-line as the child produces it.
    for line in proc.stdout:
        print(line, end="")
    proc.wait()

    if proc.returncode != 0:
        print(f"Command failed with return code {proc.returncode}")
    return proc.returncode
54
+
55
def setup_venv():
    """Sets up a persistent virtual environment in /workspace."""
    if os.path.exists(VENV_DIR):
        print("Virtual environment already exists.")
        return
    print(f"Creating virtual environment in {VENV_DIR}...")
    run_command(f"python3 -m venv {VENV_DIR}")
62
+
63
def install_package(package_name):
    """Installs a pip package into the persistent venv."""
    pip_bin = os.path.join(VENV_DIR, "bin", "pip")
    run_command(f"{pip_bin} install {package_name}")
67
+
68
def install_git_xet():
    """Installs git-xet using the huggingface script."""
    print("Installing git-xet...")
    # NOTE: `curl | bash` executes a remote script; acceptable only because the
    # source host (huggingface.co) is trusted here.
    run_command("curl -LsSf https://huggingface.co/install-git-xet.sh | bash")
    run_command("git xet install")
73
+
74
def install_hf_cli():
    """Installs Hugging Face CLI."""
    print("Installing Hugging Face CLI...")
    # NOTE: remote install script piped to bash — trusted source (hf.co).
    run_command("curl -LsSf https://hf.co/cli/install.sh | bash")
78
+
79
def download_space():
    """Downloads the Qwen Space using hf cli."""
    if not os.path.exists(REPO_DIR):
        os.makedirs(REPO_DIR)

    print(f"Downloading Space to {REPO_DIR}...")
    # Prefer the user-local install location; fall back to whatever is on PATH.
    hf_bin = os.path.expanduser("~/.local/bin/hf")
    if not os.path.exists(hf_bin):
        hf_bin = "hf"  # fallback to PATH

    run_command(
        f"{hf_bin} download Pr0f3ssi0n4ln00b/Qwen-Image-Edit-Rapid-AIO-Loras-Experimental "
        f"--repo-type=space --local-dir {REPO_DIR}",
        env={"HF_TOKEN": HF_TOKEN},
    )
92
+
93
def create_app_file(filename, content):
    """Create or overwrite a file in the apps directory.

    Args:
        filename: Bare file name to write under ``APPS_DIR``.
        content: Full text content to write.
    """
    # exist_ok avoids the check-then-create race of the previous version.
    os.makedirs(APPS_DIR, exist_ok=True)

    filepath = os.path.join(APPS_DIR, filename)
    # Explicit UTF-8 so the written app files are identical regardless of the
    # container locale (the default encoding is platform-dependent).
    with open(filepath, "w", encoding="utf-8") as f:
        f.write(content)
    print(f"Created/Updated: {filepath}")
102
+
103
+ def patch_app():
104
+ """Patches app.py to optimize for VRAM and fix OOM issues."""
105
+ app_path = os.path.join(REPO_DIR, "app.py")
106
+ if not os.path.exists(app_path):
107
+ print(f"Warning: {app_path} not found, cannot patch.")
108
+ return
109
+
110
+ print("Patching app.py for memory optimization...")
111
+ with open(app_path, "r") as f:
112
+ content = f.read()
113
+
114
+ # 1. Update transformer loading to use device_map="auto" and low_cpu_mem_usage
115
+ content = content.replace(
116
+ 'device_map="cuda",',
117
+ 'device_map="auto",\n low_cpu_mem_usage=True,'
118
+ )
119
+
120
+ # 2. Remove redundant .to(device) which causes OOM
121
+ content = content.replace(').to(device)', ')')
122
+
123
+ # 3. Enable model CPU offload to save VRAM
124
+ if "p.enable_model_cpu_offload()" not in content:
125
+ content = content.replace(
126
+ 'return p',
127
+ 'p.enable_model_cpu_offload()\n return p'
128
+ )
129
+
130
+ # 4. Disable FA3 Processor (to avoid hangs/compilation issues)
131
+ content = content.replace(
132
+ 'pipe.transformer.set_attn_processor(QwenDoubleStreamAttnProcessorFA3())',
133
+ 'print("Skipping FA3 optimization for stability.")'
134
+ )
135
+
136
+ # 5. Fix launch parameters for visibility and accessibility
137
+ content = content.replace(
138
+ 'demo.queue(max_size=30).launch(',
139
+ 'demo.queue(max_size=30).launch(server_name="0.0.0.0", share=True, '
140
+ )
141
+
142
+ # 6. Ensure spaces.GPU is handled (if it blocks)
143
+ # Usually it's fine, but let's be safe and mock it if env isn't right
144
+ if 'import spaces' in content and 'class spaces:' not in content:
145
+ content = 'import sys\ntry:\n import spaces\nexcept ImportError:\n class spaces:\n @staticmethod\n def GPU(f): return f\nsys.modules["spaces"] = sys.modules.get("spaces", spaces)\n' + content
146
+
147
+ # 7. Add missing LORA_PRESET_PROMPTS (Robust append)
148
+ additional_prompts_map = {
149
+ "Consistance": "improve consistency and quality of the generated image",
150
+ "F2P": "transform the image into a high-quality photo with realistic details",
151
+ "Multiple-Angles": "change the camera angle of the image",
152
+ "Light-Restoration": "Remove shadows and relight the image using soft lighting",
153
+ "Relight": "Relight the image with cinematic lighting",
154
+ "Multi-Angle-Lighting": "Change the lighting direction and intensity",
155
+ "Edit-Skin": "Enhance skin textures and natural details",
156
+ "Next-Scene": "Generate the next scene based on the current image",
157
+ "Flat-Log": "Desaturate and lower contrast for a flat log look",
158
+ "Upscale-Image": "Enhance and sharpen the image details",
159
+ "BFS-Best-FaceSwap": "head_swap : start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure, mouth, lips and front head of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
160
+ "BFS-Best-FaceSwap-merge": "head_swap : start with Picture 1 as the base image, keeping its lighting, environment, and background. remove the head from Picture 1 completely and replace it with the head from Picture 2, strictly preserving the hair, eye color, and nose structure, mouth, lips and front head of Picture 2. copy the eye direction, head rotation, and micro-expressions from Picture 1. high quality, sharp details, 4k",
161
+ "Qwen-lora-nsfw": "Convert this picture to artistic style.", # Default prompt
162
+ }
163
+
164
+ # 9. Add new LoRA to ADAPTER_SPECS
165
+ new_lora_config = """
166
+ "Qwen-lora-nsfw": {
167
+ "type": "single",
168
+ "repo": "wiikoo/Qwen-lora-nsfw",
169
+ "weights": "loras/qwen_image_edit_remove-clothing_v1.0.safetensors",
170
+ "adapter_name": "qwen-lora-nsfw",
171
+ "strength": 1.0,
172
+ },
173
+ """
174
+ if '"Qwen-lora-nsfw":' not in content:
175
+ content = content.replace(
176
+ 'ADAPTER_SPECS = {',
177
+ 'ADAPTER_SPECS = {' + new_lora_config
178
+ )
179
+
180
+ if "Manual Patch for missing prompts" not in content:
181
+ content += "\n\n# Manual Patch for missing prompts\ntry:\n LORA_PRESET_PROMPTS.update({\n"
182
+ for key, val in additional_prompts_map.items():
183
+ content += f' "{key}": "{val}",\n'
184
+ content += " })\nexcept NameError:\n pass\n"
185
+
186
+ # 8. Modify on_lora_change_ui to ALWAYS update the prompt if a style is picked
187
+ # (or at least be more aggressive)
188
+ new_ui_logic = """
189
+ def on_lora_change_ui(selected_lora, current_prompt, extras_condition_only):
190
+ # Always provide the preset if selected
191
+ prompt_val = current_prompt
192
+ if selected_lora != NONE_LORA:
193
+ preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
194
+ if preset:
195
+ prompt_val = preset
196
+
197
+ prompt_update = gr.update(value=prompt_val)
198
+ """
199
+ # Find the old function and replace it
200
+ start_marker = "def on_lora_change_ui"
201
+ end_marker = "return prompt_update, img2_update, extras_update"
202
+
203
+ if start_marker in content and end_marker in content:
204
+ import re
205
+ content = re.sub(
206
+ r"def on_lora_change_ui\(.*?\):.*?return prompt_update, img2_update, extras_update",
207
+ new_ui_logic + "\n # Image2 visibility/label\n if lora_requires_two_images(selected_lora):\n img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))\n else:\n img2_update = gr.update(visible=False, value=None, label='Upload Reference (Image 2)')\n\n # Extra references routing default\n if selected_lora in ('BFS-Best-FaceSwap', 'BFS-Best-FaceSwap-merge', 'AnyPose'):\n extras_update = gr.update(value=True)\n else:\n extras_update = gr.update(value=extras_condition_only)\n\n return prompt_update, img2_update, extras_update",
208
+ content,
209
+ flags=re.DOTALL
210
+ )
211
+
212
+ with open(app_path, "w") as f:
213
+ f.write(content)
214
+
215
+ # --- NEW UI PATCHES ---
216
+ with open(app_path, "r") as f:
217
+ content = f.read()
218
+
219
+ # 10. Implement missing _append_to_gallery function
220
+ append_fn = """
221
+ def _append_to_gallery(existing_gallery, new_image):
222
+ if existing_gallery is None:
223
+ return [new_image]
224
+ if not isinstance(existing_gallery, list):
225
+ existing_gallery = [existing_gallery]
226
+ existing_gallery.append(new_image)
227
+ return existing_gallery
228
+ """
229
+ if "def _append_to_gallery" not in content:
230
+ content = content.replace(
231
+ '# UI helpers: output routing + derived conditioning',
232
+ '# UI helpers: output routing + derived conditioning\n' + append_fn
233
+ )
234
+
235
+ # 11. Remove height constraints from main image components
236
+ content = content.replace('height=290)', ')')
237
+ content = content.replace('height=350)', ')')
238
+
239
+ # 12. Strip out gr.Examples block to declutter UI
240
+ # We find the start of gr.Examples and the end of its call
241
+ if "gr.Examples(" in content:
242
+ import re
243
+ content = re.sub(
244
+ r"gr\.Examples\([\s\S]*?label=\"Examples\"[\s\S]*?\)",
245
+ "# Examples removed automatically by setup_manager",
246
+ content
247
+ )
248
+
249
+ with open(app_path, "w") as f:
250
+ f.write(content)
251
+ # --- END NEW UI PATCHES ---
252
+
253
+ # --- 3D CAMERA AND PROMPT CLEARING PATCHES ---
254
+ with open(app_path, "r") as f:
255
+ content = f.read()
256
+
257
+ # Import the custom 3D Camera control safely at the top
258
+ if "update_prompt_with_camera" not in content:
259
+ content = content.replace("import os", "import os\nfrom camera_control_ui import CameraControl3D, build_camera_prompt, update_prompt_with_camera")
260
+
261
+ # Add the 3D Camera LoRA to ADAPTER_SPECS
262
+ camera_lora_config = """
263
+ "3D-Camera": {
264
+ "type": "single",
265
+ "repo": "fal/Qwen-Image-Edit-2511-Multiple-Angles-LoRA",
266
+ "weights": "qwen-image-edit-2511-multiple-angles-lora.safetensors",
267
+ "adapter_name": "angles",
268
+ "strength": 1.0,
269
+ },
270
+ """
271
+ if '"3D-Camera":' not in content:
272
+ content = content.replace(
273
+ 'ADAPTER_SPECS = {',
274
+ 'ADAPTER_SPECS = {' + camera_lora_config
275
+ )
276
+
277
+ # Patch on_lora_change_ui to clear prompt if no preset exists and toggle 3D camera visibility
278
+ prompt_clear_logic = """
279
+ def on_lora_change_ui(selected_lora, current_prompt, extras_condition_only):
280
+ prompt_val = current_prompt
281
+ if selected_lora != NONE_LORA:
282
+ preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
283
+ if preset:
284
+ prompt_val = preset
285
+ else:
286
+ prompt_val = "" # CLEAR THE PROMPT IF ACTIVE BUT NO PRESET
287
+
288
+ prompt_update = gr.update(value=prompt_val)
289
+ camera_update = gr.update(visible=(selected_lora == "3D-Camera"))
290
+
291
+ # Image2 visibility/label
292
+ if lora_requires_two_images(selected_lora):
293
+ img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))
294
+ else:
295
+ img2_update = gr.update(visible=False, value=None, label='Upload Reference (Image 2)')
296
+
297
+ # Extra references routing default
298
+ if selected_lora in ('BFS-Best-FaceSwap', 'BFS-Best-FaceSwap-merge', 'AnyPose'):
299
+ extras_update = gr.update(value=True)
300
+ else:
301
+ extras_update = gr.update(value=extras_condition_only)
302
+
303
+ return prompt_update, img2_update, extras_update, camera_update
304
+ """
305
+ old_on_lora = """
306
+ def on_lora_change_ui(selected_lora, current_prompt, extras_condition_only):
307
+ # Always provide the preset if selected
308
+ prompt_val = current_prompt
309
+ if selected_lora != NONE_LORA:
310
+ preset = LORA_PRESET_PROMPTS.get(selected_lora, "")
311
+ if preset:
312
+ prompt_val = preset
313
+
314
+ prompt_update = gr.update(value=prompt_val)
315
+
316
+ # Image2 visibility/label
317
+ if lora_requires_two_images(selected_lora):
318
+ img2_update = gr.update(visible=True, label=image2_label_for_lora(selected_lora))
319
+ else:
320
+ img2_update = gr.update(visible=False, value=None, label='Upload Reference (Image 2)')
321
+
322
+ # Extra references routing default
323
+ if selected_lora in ('BFS-Best-FaceSwap', 'BFS-Best-FaceSwap-merge', 'AnyPose'):
324
+ extras_update = gr.update(value=True)
325
+ else:
326
+ extras_update = gr.update(value=extras_condition_only)
327
+
328
+ return prompt_update, img2_update, extras_update
329
+ """
330
+ if "camera_update = gr.update(visible" not in content:
331
+ content = content.replace(old_on_lora.strip(), prompt_clear_logic.strip())
332
+
333
+ # We also need to update the caller
334
+ content = content.replace(
335
+ "outputs=[prompt, input_image_2, extras_condition_only],",
336
+ "outputs=[prompt, input_image_2, extras_condition_only, camera_container],"
337
+ )
338
+
339
+ # Inject the 3D Camera UI Block right below input_image_2 definition
340
+ camera_ui_block = """
341
+ input_image_2 = gr.Image(label="Upload Reference (Image 2)", type="pil", height=290, visible=False)
342
+
343
+ with gr.Column(visible=False) as camera_container:
344
+ gr.Markdown("### 🎮 3D Camera Control\\n*Drag handles: 🟢 Azimuth, 🩷 Elevation, 🟠 Distance*")
345
+ camera_3d = CameraControl3D(value={"azimuth": 0, "elevation": 0, "distance": 1.0}, elem_id="camera-3d-control")
346
+ gr.Markdown("### 🎚️ Slider Controls")
347
+ azimuth_slider = gr.Slider(label="Azimuth", minimum=0, maximum=315, step=45, value=0, info="0°=front, 90°=right, 180°=back, 270°=left")
348
+ elevation_slider = gr.Slider(label="Elevation", minimum=-30, maximum=60, step=30, value=0, info="-30°=low angle, 0°=eye, 60°=high angle")
349
+ distance_slider = gr.Slider(label="Distance", minimum=0.6, maximum=1.4, step=0.4, value=1.0, info="0.6=close, 1.0=medium, 1.4=wide")
350
+ """
351
+ if "camera_container:" not in content:
352
+ content = content.replace(
353
+ ' input_image_2 = gr.Image(label="Upload Reference (Image 2)", type="pil", height=290, visible=False)',
354
+ camera_ui_block.strip("\\n")
355
+ )
356
+
357
+ # Inject the Events. We place them right before "run_button.click("
358
+ camera_events = """
359
+ # --- 3D Camera Events ---
360
+ def update_prompt_from_sliders(az, el, dist, curr_prompt):
361
+ return update_prompt_with_camera(az, el, dist, curr_prompt)
362
+
363
+ def sync_3d_to_sliders(cv, curr_prompt):
364
+ if cv and isinstance(cv, dict):
365
+ az = cv.get('azimuth', 0)
366
+ el = cv.get('elevation', 0)
367
+ dist = cv.get('distance', 1.0)
368
+ return az, el, dist, update_prompt_with_camera(az, el, dist, curr_prompt)
369
+ return gr.update(), gr.update(), gr.update(), gr.update()
370
+
371
+ def sync_sliders_to_3d(az, el, dist):
372
+ return {"azimuth": az, "elevation": el, "distance": dist}
373
+
374
+
375
+ def update_3d_image(img):
376
+ if img is None: return gr.update(imageUrl=None)
377
+ import base64
378
+ from io import BytesIO
379
+ buf = BytesIO()
380
+ img.save(buf, format="PNG")
381
+ durl = f"data:image/png;base64,{base64.b64encode(buf.getvalue()).decode()}"
382
+ return gr.update(imageUrl=durl)
383
+
384
+ for slider in [azimuth_slider, elevation_slider, distance_slider]:
385
+ slider.change(fn=update_prompt_from_sliders, inputs=[azimuth_slider, elevation_slider, distance_slider, prompt], outputs=[prompt])
386
+ slider.release(fn=sync_sliders_to_3d, inputs=[azimuth_slider, elevation_slider, distance_slider], outputs=[camera_3d])
387
+
388
+ camera_3d.change(fn=sync_3d_to_sliders, inputs=[camera_3d, prompt], outputs=[azimuth_slider, elevation_slider, distance_slider, prompt])
389
+
390
+ input_image_1.upload(fn=update_3d_image, inputs=[input_image_1], outputs=[camera_3d])
391
+ input_image_1.clear(fn=lambda: gr.update(imageUrl=None), outputs=[camera_3d])
392
+
393
+ run_button.click(
394
+ """
395
+ if "def sync_3d_to_sliders" not in content:
396
+ content = content.replace(" run_button.click(\n", camera_events)
397
+
398
+ # Clear any bad \\n literals if they exist
399
+ content = content.replace("\\n demo.queue", "\n demo.queue")
400
+
401
+ if "head=" not in content:
402
+ content = content.replace(
403
+ "demo.queue(max_size=30).launch(",
404
+ """head = '<script src="https://cdnjs.cloudflare.com/ajax/libs/three.js/r128/three.min.js"></script>'
405
+ demo.queue(max_size=30).launch(head=head, """
406
+ )
407
+
408
+ with open(app_path, "w") as f:
409
+ f.write(content)
410
+ # --- END 3D CAMERA PATCHES ---
411
+
412
+ print("Successfully patched app.py.")
413
+
414
def install_dependencies():
    """Install the app's requirements.txt into the persistent virtualenv.

    Looks for ``requirements.txt`` inside REPO_DIR and feeds it to the
    venv's own pip so packages land on the persistent volume.
    """
    pip_bin = os.path.join(VENV_DIR, "bin", "pip")
    req_file = os.path.join(REPO_DIR, "requirements.txt")

    # Nothing to do when the checkout ships no requirements file.
    if not os.path.exists(req_file):
        print(f"No requirements.txt found in {REPO_DIR}")
        return

    print("Installing dependencies from requirements.txt...")
    # NOTE(review): a torch pin like 2.9.1 might not exist on PyPI and may
    # need an --extra-index-url; on an L40S we typically want the latest
    # stable torch built against CUDA 12.x.
    run_command(f"{pip_bin} install -r {req_file}")
426
+
427
def run_app():
    """Start the Gradio app with the persistent venv's interpreter.

    Runs ``REPO_DIR/app.py`` via ``run_command`` (which presumably blocks
    until the child exits — confirm in its definition) with REPO_DIR as
    both working directory and PYTHONPATH.
    """
    python_path = os.path.join(VENV_DIR, "bin", "python")
    app_path = os.path.join(REPO_DIR, "app.py")

    if os.path.exists(app_path):
        print(f"Starting app: {app_path}")
        # Gradio apps often need to be bound to 0.0.0.0 for external access
        # We'll run it and see if it requires specific environment variables
        #
        # Merge with the current environment instead of replacing it:
        # passing only {"PYTHONPATH": ...} would strip PATH/HOME/HF_HOME/
        # CUDA_* from the child if run_command hands env to subprocess as-is.
        env = {**os.environ, "PYTHONPATH": REPO_DIR}
        run_command(f"{python_path} {app_path}", cwd=REPO_DIR, env=env)
    else:
        print(f"App file not found: {app_path}")
440
+
441
def main():
    """Run the full setup pipeline: venv, tooling, checkout, patches, deps."""
    # Bail out early if the persistent volume is missing entirely.
    if not os.path.exists(WORKSPACE_DIR):
        print(f"Error: {WORKSPACE_DIR} not found. Ensure this is a RunPod with persistent storage.")
        return

    setup_steps = (
        ensure_dirs,
        setup_venv,
        install_git_xet,
        install_hf_cli,
        download_space,
        patch_app,
        install_dependencies,
    )
    for step in setup_steps:
        step()

    # Launching the app is deliberately left to the 'run' argument so this
    # script can be re-run to refresh the setup without starting the server.
    print("Setup tasks completed. Run with 'run' argument to start the app.")
457
+
458
if __name__ == "__main__":
    # "script.py run" starts the app; any other invocation performs setup only.
    wants_run = len(sys.argv) > 1 and sys.argv[1] == "run"
    if wants_run:
        run_app()
    else:
        main()
start_app.sh ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
#!/bin/bash
# Launch the Qwen-Image-Edit Gradio app from the persistent /workspace volume.
set -eu  # abort instead of exec-ing from the wrong place if any step fails

APP_DIR=/workspace/Qwen-Image-Edit

export PYTHONPATH="$APP_DIR"
export TMPDIR=/workspace/tmp
export HF_HOME=/workspace/cache/huggingface
export PYTHONUNBUFFERED=1  # stream logs immediately (paired with python -u)

# TMPDIR must exist or tempfile-using libraries fail at startup.
mkdir -p "$TMPDIR"

cd "$APP_DIR"
# exec replaces the shell so SIGTERM reaches the python process directly.
exec /workspace/venv/bin/python -u "$APP_DIR/app.py"