Yana-Hangabina committed
Commit 201d119 · 1 Parent(s): 15ebffa

Release online demo (#1)


- Add Space-only TAG-MoE demo files (5fd79e9e5e8c19cf4331bd7032513e54449736dd)
- Remove cached pyc from Space branch (e7ff53c37003024226f12a82aae43b181739dc64)
- Add Space README metadata header (6428707556418f3ebf89879804f2e61e892cf6f1)


Co-authored-by: Yana-Hangabina <Yana-Hangabina@users.noreply.huggingface.co>

README.md CHANGED
@@ -1,14 +1,15 @@
 ---
-title: TAG MoE
-emoji: 📚
+title: TAG-MoE
+emoji: 🎨
 colorFrom: red
 colorTo: gray
 sdk: gradio
-sdk_version: 6.1.0
+sdk_version: 5.49.1
+python_version: 3.10
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: TAG-MoE:Task-Aware Gating for Unified Generative Mixture-of-
+short_description: Task-Aware Gating for Unified Generative Mixture-of-Experts
 ---
 
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+TAG-MoE Space demo.
app.py ADDED
@@ -0,0 +1,315 @@
+import os
+import threading
+
+import gradio as gr
+
+from src.utils.device_utils import resolve_device_ids
+from src.utils.inference_config import (
+    DEFAULT_HEIGHT,
+    DEFAULT_NEGATIVE_PROMPT,
+    DEFAULT_NUM_INFERENCE_STEPS,
+    DEFAULT_SEED,
+    DEFAULT_TRUE_CFG_SCALE,
+    DEFAULT_WIDTH,
+    generate_random_seed,
+)
+
+try:
+    import spaces
+except ImportError:
+    spaces = None
+
+
+def _env_bool(name: str, default: bool = False) -> bool:
+    value = os.getenv(name)
+    if value is None:
+        return default
+    return value.strip().lower() in {"1", "true", "yes", "on"}
+
+
+def _env_int(name: str, default: int) -> int:
+    value = os.getenv(name)
+    if value is None or not value.strip():
+        return default
+    return int(value.strip())
+
+
+PRETRAINED_MODEL_PATH = os.getenv("PRETRAINED_MODEL_PATH", "Qwen/Qwen-Image")
+TRANSFORMER_MODEL_PATH = os.getenv("TRANSFORMER_MODEL_PATH", "YUXU915/TAG-MoE")
+TRANSFORMER_WEIGHT_NAME = os.getenv("TRANSFORMER_WEIGHT_NAME", "diffusion_pytorch_model.safetensors")
+TRANSFORMER_SUBFOLDER = os.getenv("TRANSFORMER_SUBFOLDER", "transformer")
+TRANSFORMER_REVISION = os.getenv("TRANSFORMER_REVISION", "").strip() or None
+LOCAL_FILES_ONLY = _env_bool("LOCAL_FILES_ONLY", default=False)
+TAGMOE_DEVICE = os.getenv("TAGMOE_DEVICE", "auto").strip().lower()
+ZERO_GPU_DURATION = _env_int("ZERO_GPU_DURATION", default=300)
+
+LINKS_HTML = """
+<div class="tagmoe-links">
+  <a href="https://yuci-gpt.github.io/TAG-MoE/" target="_blank" rel="noopener noreferrer">Project Homepage</a>
+  <a href="https://arxiv.org/abs/2601.08881" target="_blank" rel="noopener noreferrer">Paper (arXiv)</a>
+  <a href="https://github.com/ICTMCG/TAG-MoE" target="_blank" rel="noopener noreferrer">GitHub Repo</a>
+  <a href="https://huggingface.co/YUXU915/TAG-MoE" target="_blank" rel="noopener noreferrer">Model Weights</a>
+</div>
+"""
+
+_RUNTIME_LOCK = threading.Lock()
+_PIPELINE = None
+_BASE64_TO_IMAGE_FN = None
+
+
+def _resolve_runtime_device_ids():
+    if TAGMOE_DEVICE in {"", "auto", "default"}:
+        import torch
+
+        return [0] if torch.cuda.is_available() else []
+    if TAGMOE_DEVICE in {"none", "framework"}:
+        return None
+    return resolve_device_ids(TAGMOE_DEVICE)
+
+
+def _ensure_runtime_loaded():
+    global _PIPELINE, _BASE64_TO_IMAGE_FN
+
+    if _PIPELINE is not None and _BASE64_TO_IMAGE_FN is not None:
+        return _PIPELINE, _BASE64_TO_IMAGE_FN
+
+    with _RUNTIME_LOCK:
+        if _PIPELINE is not None and _BASE64_TO_IMAGE_FN is not None:
+            return _PIPELINE, _BASE64_TO_IMAGE_FN
+
+        from src.infer_tagmoe import End2End, base64_to_image
+
+        device_ids = _resolve_runtime_device_ids()
+        _PIPELINE = End2End(
+            pretrained_model_path=PRETRAINED_MODEL_PATH,
+            transformer_model_path=TRANSFORMER_MODEL_PATH,
+            device_ids=device_ids,
+            transformer_weight_name=TRANSFORMER_WEIGHT_NAME,
+            transformer_subfolder=TRANSFORMER_SUBFOLDER,
+            transformer_revision=TRANSFORMER_REVISION,
+            local_files_only=LOCAL_FILES_ONLY,
+        )
+        _BASE64_TO_IMAGE_FN = base64_to_image
+        return _PIPELINE, _BASE64_TO_IMAGE_FN
+
+
+class LazyPipelineProxy:
+    def predict(self, input_dict):
+        pipeline, _ = _ensure_runtime_loaded()
+        return pipeline.predict(input_dict)
+
+
+def _lazy_base64_to_image(data):
+    _, base64_to_image_fn = _ensure_runtime_loaded()
+    return base64_to_image_fn(data)
+
+
+def _infer_decorator():
+    if spaces is None:
+        return lambda fn: fn
+    return spaces.GPU(duration=ZERO_GPU_DURATION)
+
+
+def build_demo(gr, pipeline, base64_to_image_fn):
+    def infer(
+        image,
+        prompt,
+        negative_prompt,
+        seed,
+        gen_width,
+        gen_height,
+        cfg_scale,
+        inference_steps,
+    ):
+        if prompt is None or not str(prompt).strip():
+            raise gr.Error("Prompt cannot be empty.")
+        if image is None:
+            raise gr.Error("Image is required.")
+
+        width_value = int(gen_width) if gen_width is not None else int(image.size[0])
+        height_value = int(gen_height) if gen_height is not None else int(image.size[1])
+        input_dict = {
+            "image": image.convert("RGB"),
+            "prompt": str(prompt).strip(),
+            "negative_prompt": str(negative_prompt or DEFAULT_NEGATIVE_PROMPT),
+            "seed": int(seed if seed is not None else DEFAULT_SEED),
+            "target_width": width_value,
+            "target_height": height_value,
+            "true_cfg_scale": float(cfg_scale),
+            "num_inference_steps": int(inference_steps),
+            "keep_original_size": False,
+        }
+        result = pipeline.predict(input_dict)
+        out_image = base64_to_image_fn(result["generate_imgs_buffer"][0])
+        return out_image, int(result["seed"])
+
+    def randomize_seed():
+        return generate_random_seed()
+
+    def on_image_upload(image):
+        if image is None:
+            return gr.update(), gr.update()
+        return int(image.size[0]), int(image.size[1])
+
+    title_html = """
+    <div class="tagmoe-header">
+      <picture>
+        <source srcset="https://raw.githubusercontent.com/yuci-gpt/TAG-MoE/refs/heads/master/static/images/logo_dark.png" media="(prefers-color-scheme: dark)">
+        <img src="https://raw.githubusercontent.com/yuci-gpt/TAG-MoE/refs/heads/master/static/images/logo_light.png" alt="TAG-MoE logo">
+      </picture>
+      <div>
+        <h1>TAG-MoE</h1>
+        <p>Task-Aware Gating for Unified Generative Mixture-of-Experts</p>
+      </div>
+    </div>
+    """
+
+    custom_css = """
+    .tagmoe-header {
+        display: flex;
+        align-items: center;
+        gap: 12px;
+        margin-bottom: 8px;
+    }
+    .tagmoe-header img {
+        width: 48px;
+        height: 48px;
+        object-fit: contain;
+    }
+    .tagmoe-header h1 {
+        margin: 0;
+        font-size: 1.8rem;
+    }
+    .tagmoe-header p {
+        margin: 0;
+        opacity: 0.85;
+        font-size: 0.95rem;
+    }
+    .param-card {
+        border: 1px solid var(--border-color-primary);
+        border-radius: 12px;
+        padding: 14px 14px 10px;
+        margin-bottom: 10px;
+    }
+    .param-card .gradio-textbox textarea {
+        min-height: 110px !important;
+    }
+    .run-btn button {
+        height: 46px !important;
+        font-weight: 600;
+    }
+    .image-panel {
+        border: 1px solid var(--border-color-primary);
+        border-radius: 12px;
+        padding: 10px;
+    }
+    .tool-btn {
+        margin-top: 28px !important;
+        min-width: 42px !important;
+        height: 42px !important;
+        padding: 0 !important;
+        display: flex;
+        align-items: center;
+        justify-content: center;
+        flex-shrink: 0;
+    }
+    .tagmoe-links {
+        margin: 6px 0 14px 0;
+        display: flex;
+        flex-wrap: wrap;
+        gap: 12px;
+        font-size: 0.95rem;
+    }
+    .tagmoe-links a {
+        text-decoration: none;
+    }
+    """
+
+    infer_fn = _infer_decorator()(infer)
+    with gr.Blocks(title="TAG-MoE Space Demo", css=custom_css) as demo:
+        gr.HTML(title_html)
+        gr.HTML(LINKS_HTML)
+
+        with gr.Row(equal_height=True):
+            with gr.Column(scale=1, elem_classes=["image-panel"]):
+                image_input = gr.Image(type="pil", label="Input Image", height=520)
+            with gr.Column(scale=1, elem_classes=["image-panel"]):
+                image_output = gr.Image(type="pil", label="Output Image", height=520)
+
+        with gr.Group(elem_classes=["param-card"]):
+            prompt_input = gr.Textbox(
+                label="Prompt",
+                placeholder="Describe the instruction",
+                lines=3,
+            )
+            negative_prompt_input = gr.Textbox(
+                label="Negative Prompt",
+                value=DEFAULT_NEGATIVE_PROMPT,
+                lines=2,
+                placeholder="Optional negative prompt",
+            )
+            with gr.Row():
+                gen_width_input = gr.Slider(minimum=64, maximum=4096, step=1, value=DEFAULT_WIDTH, label="Width")
+                gen_height_input = gr.Slider(minimum=64, maximum=4096, step=1, value=DEFAULT_HEIGHT, label="Height")
+            with gr.Row():
+                cfg_scale_input = gr.Slider(
+                    minimum=1.0,
+                    maximum=10.0,
+                    step=0.1,
+                    value=DEFAULT_TRUE_CFG_SCALE,
+                    label="CFG Scale",
+                )
+                inference_steps_input = gr.Slider(
+                    minimum=10,
+                    maximum=100,
+                    step=1,
+                    value=DEFAULT_NUM_INFERENCE_STEPS,
+                    label="Inference Steps",
+                )
+                with gr.Column(scale=1, min_width=200):
+                    with gr.Row():
+                        seed_input = gr.Number(
+                            label="Seed",
+                            value=generate_random_seed(),
+                            precision=0,
+                            scale=1,
+                        )
+                        random_seed_btn = gr.Button(
+                            "🎲",
+                            elem_classes=["tool-btn"],
+                            scale=0,
+                            min_width=42,
+                            variant="secondary",
+                        )
+                    run_btn = gr.Button("Run Inference", variant="primary", elem_classes=["run-btn"])
+
+        run_btn.click(
+            fn=infer_fn,
+            inputs=[
+                image_input,
+                prompt_input,
+                negative_prompt_input,
+                seed_input,
+                gen_width_input,
+                gen_height_input,
+                cfg_scale_input,
+                inference_steps_input,
+            ],
+            outputs=[image_output, seed_input],
+        )
+        image_input.change(
+            fn=on_image_upload,
+            inputs=[image_input],
+            outputs=[gen_width_input, gen_height_input],
+        )
+        random_seed_btn.click(fn=randomize_seed, outputs=[seed_input])
+
+    return demo
+
+
+demo = build_demo(gr, LazyPipelineProxy(), _lazy_base64_to_image)
+demo.queue(default_concurrency_limit=1, max_size=8)
+
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
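
The app above defers all heavy imports and model construction until the first request: `_ensure_runtime_loaded` uses double-checked locking around module-level singletons, so the Gradio UI starts instantly and the `End2End` pipeline is built exactly once even under concurrent requests. A minimal standalone sketch of that pattern (the `_load_resource` stand-in is hypothetical):

```python
import threading

_LOCK = threading.Lock()
_RESOURCE = None


def _load_resource():
    # Hypothetical stand-in for the expensive End2End construction in app.py.
    return object()


def get_resource():
    global _RESOURCE
    # Fast path: already initialized, skip the lock entirely.
    if _RESOURCE is not None:
        return _RESOURCE
    with _LOCK:
        # Re-check under the lock so only one thread performs the load.
        if _RESOURCE is None:
            _RESOURCE = _load_resource()
    return _RESOURCE


assert get_resource() is get_resource()  # the singleton is created exactly once
```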
requirements.txt ADDED
@@ -0,0 +1,17 @@
+--extra-index-url https://download.pytorch.org/whl/cu126
+
+accelerate==1.10.1
+diffusers @ git+https://github.com/huggingface/diffusers.git@0e12ba74542c6ecb02719ec3e5c6e993b85556e3
+gradio>=5.49.1,<6
+grouped-gemm==0.3.0
+loguru>=0.7.3
+megablocks==0.10.0
+numpy<2.1.0
+pillow>=12.1.1
+qwen-vl-utils>=0.0.14
+safetensors>=0.7.0
+spaces>=0.35.0
+torch==2.7.0
+torchvision==0.22.0
+transformers==4.56.2
+triton>=3.3.0
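
These pins target CUDA 12.6 wheels (hence the extra index URL), with megablocks and grouped-gemm supplying the MoE kernels. A small sanity-check sketch, assuming the environment was installed from this file:

```python
# Sanity-check sketch for the pins above; run after `pip install -r requirements.txt`.
import importlib.metadata as md

import torch

for pkg, pin in [("torch", "2.7.0"), ("torchvision", "0.22.0"), ("transformers", "4.56.2")]:
    installed = md.version(pkg)
    assert installed == pin, f"{pkg}: expected {pin}, got {installed}"

# The cu126 extra index only matters when a CUDA device is actually present.
print("CUDA available:", torch.cuda.is_available())
```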
src/infer_tagmoe.py ADDED
@@ -0,0 +1,325 @@
+import base64
+import io
+import os
+import time
+from functools import partial
+
+from loguru import logger
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from PIL import Image
+
+from src.utils.device_utils import build_accelerate_max_memory_map
+from src.utils.inference_config import (
+    DEFAULT_NEGATIVE_PROMPT,
+    DEFAULT_NUM_INFERENCE_STEPS,
+    DEFAULT_SEED,
+    DEFAULT_TRUE_CFG_SCALE,
+    generate_random_seed,
+    normalize_negative_prompt,
+)
+from src.models.transformer_qwenimage_tagmoe import QwenImageTransformer2DModel, TRANSFORMER_NUM_LAYERS, MOE_NUM_EXPERTS
+from src.pipelines.pipeline_qwenimage_tagmoe import QwenImagePipeline
+
+
+def image_to_byte_array(image: Image.Image) -> bytes:
+    img_byte_arr = io.BytesIO()
+    image.save(img_byte_arr, format="PNG")
+    return img_byte_arr.getvalue()
+
+
+def image_to_base64(image: Image.Image) -> str:
+    return base64.b64encode(image_to_byte_array(image)).decode()
+
+
+def base64_to_image(base64_str: str) -> Image.Image:
+    return Image.open(io.BytesIO(base64.b64decode(base64_str))).convert("RGB")
+
+
+PREFERRED_QWENIMAGE_RESOLUTIONS = [
+    (512, 2048),
+    (512, 1984),
+    (512, 1920),
+    (512, 1856),
+    (512, 1792),
+    (512, 1728),
+    (512, 1664),
+    (512, 1600),
+    (512, 1536),
+    (576, 1472),
+    (640, 1408),
+    (704, 1344),
+    (768, 1280),
+    (832, 1216),
+    (896, 1152),
+    (960, 1088),
+    (1024, 1024),
+    (1088, 960),
+    (1152, 896),
+    (1216, 832),
+    (1280, 768),
+    (1344, 704),
+    (1408, 640),
+    (1472, 576),
+    (1536, 512),
+    (1600, 512),
+    (1664, 512),
+    (1728, 512),
+    (1792, 512),
+    (1856, 512),
+    (1920, 512),
+    (1984, 512),
+    (2048, 512),
+]
+
+
+QWEN_IMAGE_TRANSFORMER_BLOCK_DIM = 3072
+SEMANTIC_DIM = 512
+TAG_DICT = {
+    "local editing": 0,
+    "global editing": 1,
+    "multi region editing": 2,
+    "viewpoint editing": 3,
+    "content customization": 4,
+    "style customization": 5,
+    "object editing": 6,
+    "attribute editing": 7,
+    "style transfer": 8,
+    "pose editing": 9,
+    "background editing": 10,
+    "illumination editing": 11,
+    "structure preservation": 12,
+    "background preservation": 13,
+    "identity preservation": 14,
+    "face preservation": 15,
+    "style preservation": 16,
+    "image generation": 17,
+}
+
+
+class PredictionHead(nn.Module):
+    def __init__(self, gating_dim: int = 4, semantic_dim: int = 512, hidden_dim: int = 256):
+        super().__init__()
+        self.net = nn.Sequential(
+            nn.Linear(gating_dim, hidden_dim),
+            nn.GELU(),
+            nn.Linear(hidden_dim, hidden_dim),
+            nn.GELU(),
+            nn.Linear(hidden_dim, semantic_dim),
+        )
+
+    def forward(self, g: torch.Tensor) -> torch.Tensor:
+        return self.net(g)
+
+
+class End2End:
+    def __init__(
+        self,
+        pretrained_model_path,
+        transformer_model_path=None,
+        rank=0,
+        device_ids=None,
+        transformer_weight_name: str = "diffusion_pytorch_model.safetensors",
+        transformer_subfolder: str | None = "transformer",
+        transformer_revision: str | None = None,
+        local_files_only: bool = False,
+    ):
+        self.device_ids = self._resolve_device_ids(rank, device_ids)
+        self.is_multi_gpu = len(self.device_ids) > 1
+
+        self.device, self.generator_device, torch_dtype = self._resolve_runtime_device()
+        transformer = self._build_runtime_transformer(pretrained_model_path, torch_dtype)
+
+        self.pipe = QwenImagePipeline.from_pretrained(
+            pretrained_model_path,
+            transformer=transformer,
+            torch_dtype=torch_dtype,
+        )
+
+        self.pipe.init_custom(
+            transformer_model_path,
+            weight_name=transformer_weight_name,
+            subfolder=transformer_subfolder,
+            revision=transformer_revision,
+            local_files_only=local_files_only,
+        )
+        if self.is_multi_gpu:
+            self._enable_multi_gpu_dispatch(torch_dtype=torch_dtype)
+        else:
+            self.pipe = self.pipe.to(self.device)
+
+    @staticmethod
+    def _resolve_device_ids(rank, device_ids):
+        if device_ids is None:
+            return [rank] if torch.cuda.is_available() else []
+        return list(device_ids)
+
+    def _resolve_runtime_device(self):
+        if len(self.device_ids) > 0 and torch.cuda.is_available():
+            primary_gpu = self.device_ids[0]
+            torch.cuda.set_device(primary_gpu)
+            device = f"cuda:{primary_gpu}"
+            return device, device, torch.bfloat16
+        return "cpu", "cpu", torch.float32
+
+    def _build_runtime_transformer(self, pretrained_model_path, torch_dtype):
+        transformer = QwenImageTransformer2DModel.from_pretrained(
+            pretrained_model_path,
+            subfolder="transformer",
+            torch_dtype=torch_dtype,
+        )
+        self._replace_mlp_with_runtime_moe(transformer)
+        self._attach_tag_modules(transformer)
+        return transformer
+
+    def _build_moe_args(self):
+        from megablocks.layers.arguments import Arguments
+
+        return Arguments(
+            hidden_size=QWEN_IMAGE_TRANSFORMER_BLOCK_DIM,
+            ffn_hidden_size=QWEN_IMAGE_TRANSFORMER_BLOCK_DIM * 4,
+            num_layers=TRANSFORMER_NUM_LAYERS,
+            bias=True,
+            activation_fn=partial(F.gelu, approximate="tanh"),
+            moe_num_experts=MOE_NUM_EXPERTS,
+            moe_top_k=1,
+            moe_loss_weight=0.01,
+            moe_capacity_factor=1.25,
+            mlp_type="mlp",
+            shared_expert=False,
+            mlp_impl="grouped",
+            init_method=nn.init.xavier_uniform_,
+            moe_expert_model_parallelism=False,
+            expert_parallel_group=None,
+            fp16=False,
+            bf16=True,
+            device=self.device,
+        )
+
+    def _replace_mlp_with_runtime_moe(self, transformer):
+        from megablocks.layers.dmoe import dMoE
+
+        moe_args = self._build_moe_args()
+        replace_from_layer = 60 - TRANSFORMER_NUM_LAYERS
+        replace_paths = []
+        for name, _ in transformer.named_modules():
+            if not name.startswith("transformer_blocks.") or not name.endswith("img_mlp"):
+                continue
+            block_idx = int(name.split(".")[1])
+            if block_idx >= replace_from_layer:
+                replace_paths.append(name)
+
+        for path in replace_paths:
+            parent_name, child_name = path.rsplit(".", 1)
+            parent_module = transformer.get_submodule(parent_name)
+            setattr(parent_module, child_name, dMoE(moe_args))
+
+    def _attach_tag_modules(self, transformer):
+        transformer.tag_embedding = nn.Embedding(len(TAG_DICT), SEMANTIC_DIM)
+        transformer.router_head = PredictionHead(
+            gating_dim=MOE_NUM_EXPERTS,
+            semantic_dim=SEMANTIC_DIM,
+            hidden_dim=256,
+        )
+
+    def _enable_multi_gpu_dispatch(self, torch_dtype):
+        from accelerate import dispatch_model, infer_auto_device_map
+
+        free_bytes_by_device = {}
+        for device_id in self.device_ids:
+            free_bytes, _ = torch.cuda.mem_get_info(device_id)
+            free_bytes_by_device[device_id] = free_bytes
+        max_memory = build_accelerate_max_memory_map(self.device_ids, free_bytes_by_device)
+
+        transformer_device_map = infer_auto_device_map(
+            self.pipe.transformer,
+            max_memory=max_memory,
+            no_split_module_classes=["QwenImageTransformerBlock"],
+            dtype=torch_dtype,
+        )
+
+        offload_dir = None
+        if any(device == "disk" for device in transformer_device_map.values()):
+            offload_dir = os.path.join("/tmp", "tag_moe_offload")
+            os.makedirs(offload_dir, exist_ok=True)
+
+        self.pipe.transformer = dispatch_model(
+            self.pipe.transformer,
+            device_map=transformer_device_map,
+            offload_dir=offload_dir,
+        )
+
+        text_encoder_device = f"cuda:{self.device_ids[-1]}"
+        self.pipe.text_encoder = self.pipe.text_encoder.to(text_encoder_device)
+        self.pipe.vae = self.pipe.vae.to(self.device)
+
+    def predict(self, input_dict):
+        out_dict = {}
+
+        start_time = time.time()
+        image = input_dict.get("image")
+        if image is None:
+            raise ValueError("Input image is required.")
+        seed = int(input_dict.get("seed", DEFAULT_SEED))
+        prompt = input_dict.get("prompt", "")
+        negative_prompt = normalize_negative_prompt(
+            input_dict.get("negative_prompt", DEFAULT_NEGATIVE_PROMPT)
+        )
+        num_inference_steps = int(
+            input_dict.get("num_inference_steps", DEFAULT_NUM_INFERENCE_STEPS)
+        )
+        true_cfg_scale = float(
+            input_dict.get("true_cfg_scale", DEFAULT_TRUE_CFG_SCALE)
+        )
+        target_height = input_dict.get("target_height", None)
+        target_width = input_dict.get("target_width", None)
+        keep_original_size = bool(input_dict.get("keep_original_size", False))
+        has_custom_target = target_height is not None or target_width is not None
+
+        if seed < 0:
+            seed = generate_random_seed()
+        out_dict["seed"] = seed
+
+        cond_image = image
+        w_ori, h_ori = cond_image.size
+        original_size = (w_ori, h_ori)
+
+        white_bg = Image.new("RGB", cond_image.size, (255, 255, 255))
+        if cond_image.mode == "RGBA":
+            result = Image.alpha_composite(white_bg.convert("RGBA"), cond_image)
+            cond_image = result.convert("RGB")
+        else:
+            cond_image = cond_image.convert("RGB")
+
+        aspect_ratio = w_ori / h_ori
+        _, snap_width, snap_height = min(
+            (abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED_QWENIMAGE_RESOLUTIONS
+        )
+        cond_image = cond_image.resize((snap_width, snap_height), Image.LANCZOS)
+
+        if target_height is None:
+            target_height = snap_height
+        if target_width is None:
+            target_width = snap_width
+
+        out_image_pil = self.pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            width=target_width,
+            height=target_height,
+            num_inference_steps=num_inference_steps,
+            true_cfg_scale=true_cfg_scale,
+            generator=torch.Generator(device=self.generator_device).manual_seed(seed),
+            cond_image=cond_image,
+        ).images[0]
+
+        if keep_original_size and original_size is not None and not has_custom_target:
+            out_image_pil = out_image_pil.resize(original_size, Image.LANCZOS)
+
+        out_dict["generate_imgs_buffer"] = [image_to_base64(out_image_pil)]
+        logger.info(f"Generation time: {time.time() - start_time:.2f}s")
+        return out_dict
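
Before generation, `predict` snaps the conditioning image to the entry of `PREFERRED_QWENIMAGE_RESOLUTIONS` whose aspect ratio is closest to the input's, by minimizing over `(|ratio difference|, w, h)` tuples. A standalone sketch of that selection step (the 1600x900 input size is a hypothetical example and only a subset of the buckets is listed):

```python
# Standalone sketch of the resolution-snapping step in End2End.predict.
PREFERRED = [(512, 2048), (768, 1280), (1024, 1024), (1280, 768), (2048, 512)]

w_ori, h_ori = 1600, 900               # hypothetical input image size (16:9)
aspect_ratio = w_ori / h_ori           # ~1.778

# min() over (distance, w, h) tuples picks the bucket whose w/h is closest;
# exact ties fall back to comparing w, then h, by normal tuple ordering.
_, snap_w, snap_h = min((abs(aspect_ratio - w / h), w, h) for w, h in PREFERRED)
print(snap_w, snap_h)                  # -> 1280 768 for this subset
```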
src/models/transformer_qwenimage_tagmoe.py ADDED
@@ -0,0 +1,761 @@
+# Copyright 2025 Qwen-Image Team, The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import functools
+import math
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from functools import partial
+
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
+from diffusers.utils import USE_PEFT_BACKEND, logging, scale_lora_layers, unscale_lora_layers
+from diffusers.utils.torch_utils import maybe_allow_in_graph
+from diffusers.models.attention import FeedForward
+from diffusers.models.attention_dispatch import dispatch_attention_fn
+from diffusers.models.attention_processor import Attention
+from diffusers.models.cache_utils import CacheMixin
+from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+from diffusers.models.modeling_outputs import Transformer2DModelOutput
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.models.normalization import AdaLayerNormContinuous, RMSNorm
+
+from megablocks.layers.moe import MoE
+from megablocks.layers.dmoe import dMoE
+from megablocks.layers.arguments import Arguments
+
+from src.utils.device_utils import maybe_set_cuda_device_from_tensor
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+TRANSFORMER_NUM_LAYERS = 10
+TRANSFORMER_BLOCK_BAR = 60 - TRANSFORMER_NUM_LAYERS
+MOE_NUM_EXPERTS = 4
+
+
+def get_timestep_embedding(
+    timesteps: torch.Tensor,
+    embedding_dim: int,
+    flip_sin_to_cos: bool = False,
+    downscale_freq_shift: float = 1,
+    scale: float = 1,
+    max_period: int = 10000,
+) -> torch.Tensor:
+    """
+    This matches the implementation in Denoising Diffusion Probabilistic Models: Create sinusoidal timestep embeddings.
+
+    Args:
+        timesteps (torch.Tensor):
+            a 1-D Tensor of N indices, one per batch element. These may be fractional.
+        embedding_dim (int):
+            the dimension of the output.
+        flip_sin_to_cos (bool):
+            Whether the embedding order should be `cos, sin` (if True) or `sin, cos` (if False).
+        downscale_freq_shift (float):
+            Controls the delta between frequencies between dimensions.
+        scale (float):
+            Scaling factor applied to the embeddings.
+        max_period (int):
+            Controls the maximum frequency of the embeddings.
+
+    Returns:
+        torch.Tensor: an [N x dim] Tensor of positional embeddings.
+    """
+    assert len(timesteps.shape) == 1, "Timesteps should be a 1d-array"
+
+    half_dim = embedding_dim // 2
+    exponent = -math.log(max_period) * torch.arange(
+        start=0, end=half_dim, dtype=torch.float32, device=timesteps.device
+    )
+    exponent = exponent / (half_dim - downscale_freq_shift)
+
+    emb = torch.exp(exponent).to(timesteps.dtype)
+    emb = timesteps[:, None].float() * emb[None, :]
+
+    # scale embeddings
+    emb = scale * emb
+
+    # concat sine and cosine embeddings
+    emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=-1)
+
+    # flip sine and cosine embeddings
+    if flip_sin_to_cos:
+        emb = torch.cat([emb[:, half_dim:], emb[:, :half_dim]], dim=-1)
+
+    # zero pad
+    if embedding_dim % 2 == 1:
+        emb = torch.nn.functional.pad(emb, (0, 1, 0, 0))
+    return emb
+
+
+def apply_rotary_emb_qwen(
+    x: torch.Tensor,
+    freqs_cis: Union[torch.Tensor, Tuple[torch.Tensor]],
+    use_real: bool = True,
+    use_real_unbind_dim: int = -1,
+) -> Tuple[torch.Tensor, torch.Tensor]:
+    """
+    Apply rotary embeddings to input tensors using the given frequency tensor. This function applies rotary embeddings
+    to the given query or key 'x' tensors using the provided frequency tensor 'freqs_cis'. The input tensors are
+    reshaped as complex numbers, and the frequency tensor is reshaped for broadcasting compatibility. The resulting
+    tensors contain rotary embeddings and are returned as real tensors.
+
+    Args:
+        x (`torch.Tensor`):
+            Query or key tensor to apply rotary embeddings to, of shape [B, S, H, D].
+        freqs_cis (`Tuple[torch.Tensor]`):
+            Precomputed frequency tensor for complex exponentials. ([S, D], [S, D],)
+
+    Returns:
+        Tuple[torch.Tensor, torch.Tensor]: Tuple of modified query tensor and key tensor with rotary embeddings.
+    """
+    if use_real:
+        cos, sin = freqs_cis  # [S, D]
+        cos = cos[None, None]
+        sin = sin[None, None]
+        cos, sin = cos.to(x.device), sin.to(x.device)
+
+        if use_real_unbind_dim == -1:
+            # Used for flux, cogvideox, hunyuan-dit
+            x_real, x_imag = x.reshape(*x.shape[:-1], -1, 2).unbind(-1)  # [B, S, H, D//2]
+            x_rotated = torch.stack([-x_imag, x_real], dim=-1).flatten(3)
+        elif use_real_unbind_dim == -2:
+            # Used for Stable Audio, OmniGen, CogView4 and Cosmos
+            x_real, x_imag = x.reshape(*x.shape[:-1], 2, -1).unbind(-2)  # [B, S, H, D//2]
+            x_rotated = torch.cat([-x_imag, x_real], dim=-1)
+        else:
+            raise ValueError(f"`use_real_unbind_dim={use_real_unbind_dim}` but should be -1 or -2.")
+
+        out = (x.float() * cos + x_rotated.float() * sin).to(x.dtype)
+
+        return out
+    else:
+        x_rotated = torch.view_as_complex(x.float().reshape(*x.shape[:-1], -1, 2))
+        freqs_cis = freqs_cis.unsqueeze(1)
+        x_out = torch.view_as_real(x_rotated * freqs_cis).flatten(3)
+
+        return x_out.type_as(x)
+
+
+class QwenTimestepProjEmbeddings(nn.Module):
+    def __init__(self, embedding_dim):
+        super().__init__()
+
+        self.time_proj = Timesteps(num_channels=256, flip_sin_to_cos=True, downscale_freq_shift=0, scale=1000)
+        self.timestep_embedder = TimestepEmbedding(in_channels=256, time_embed_dim=embedding_dim)
+
+    def forward(self, timestep, hidden_states):
+        timesteps_proj = self.time_proj(timestep)
+        timesteps_emb = self.timestep_embedder(timesteps_proj.to(dtype=hidden_states.dtype))  # (N, D)
+
+        conditioning = timesteps_emb
+
+        return conditioning
+
+
+class QwenEmbedRope(nn.Module):
+    def __init__(self, theta: int, axes_dim: List[int], scale_rope=False):
+        super().__init__()
+        self.theta = theta
+        self.axes_dim = axes_dim
+        pos_index = torch.arange(4096)
+        neg_index = torch.arange(4096).flip(0) * -1 - 1
+        self.pos_freqs = torch.cat(
+            [
+                self.rope_params(pos_index, self.axes_dim[0], self.theta),
+                self.rope_params(pos_index, self.axes_dim[1], self.theta),
+                self.rope_params(pos_index, self.axes_dim[2], self.theta),
+            ],
+            dim=1,
+        )
+        self.neg_freqs = torch.cat(
+            [
+                self.rope_params(neg_index, self.axes_dim[0], self.theta),
+                self.rope_params(neg_index, self.axes_dim[1], self.theta),
+                self.rope_params(neg_index, self.axes_dim[2], self.theta),
+            ],
+            dim=1,
+        )
+        self.rope_cache = {}
+        self.cond_rope_cache = {}
+
+        # Whether to use scaled RoPE
+        self.scale_rope = scale_rope
+
+    def rope_params(self, index, dim, theta=10000):
+        """
+        Args:
+            index: [0, 1, 2, 3] 1D Tensor representing the position index of the token
+        """
+        assert dim % 2 == 0
+        freqs = torch.outer(index, 1.0 / torch.pow(theta, torch.arange(0, dim, 2).to(torch.float32).div(dim)))
+        freqs = torch.polar(torch.ones_like(freqs), freqs)
+        return freqs
+
+    def forward(self, video_fhw, txt_seq_lens, device):
+        """
+        Args:
+            video_fhw: [frame, height, width] a list of 3 integers representing the shape of the video
+            txt_seq_lens: [bs] a list of integers representing the length of the text
+        """
+        if self.pos_freqs.device != device:
+            self.pos_freqs = self.pos_freqs.to(device)
+            self.neg_freqs = self.neg_freqs.to(device)
+
+        if isinstance(video_fhw, list):
+            video_fhw = video_fhw[0]
+        frame, height, width = video_fhw
+        rope_key = f"{frame}_{height}_{width}"
+
+        if rope_key not in self.rope_cache:
+            seq_lens = frame * height * width
+            freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+            freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+            freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, height, width, -1)
+            if self.scale_rope:
+                freqs_height = torch.cat([freqs_neg[1][-(height - height // 2):], freqs_pos[1][: height // 2]], dim=0)
+                freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
+                freqs_width = torch.cat([freqs_neg[2][-(width - width // 2):], freqs_pos[2][: width // 2]], dim=0)
+                freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
+            else:
+                freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
+                freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
+
+            freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
+            self.rope_cache[rope_key] = freqs.clone().contiguous()
+        vid_freqs = self.rope_cache[rope_key]
+
+        if self.scale_rope:
+            max_vid_index = max(height // 2, width // 2)
+        else:
+            max_vid_index = max(height, width)
+
+        max_len = max(txt_seq_lens)
+        txt_freqs = self.pos_freqs[max_vid_index : max_vid_index + max_len, ...]
+
+        return vid_freqs, txt_freqs
+
+    def get_img_rope(self, video_fhw, device, frame_idx=0):
+        if self.pos_freqs.device != device:
+            self.pos_freqs = self.pos_freqs.to(device)
+            self.neg_freqs = self.neg_freqs.to(device)
+
+        if isinstance(video_fhw, list):
+            video_fhw = video_fhw[0]
+        frame, height, width = video_fhw
+        rope_key = f"{frame}_{height}_{width}_{frame_idx}"
+
+        assert frame == 1
+
+        if rope_key not in self.cond_rope_cache:
+            seq_lens = frame * height * width
+            freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+            freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+            freqs_frame = freqs_pos[0][frame_idx : frame_idx + 1].view(frame, 1, 1, -1).expand(frame, height, width, -1)
+            if self.scale_rope:
+                freqs_height = torch.cat([freqs_neg[1][-(height - height // 2):], freqs_pos[1][: height // 2]], dim=0)
+                freqs_height = freqs_height.view(1, height, 1, -1).expand(frame, height, width, -1)
+                freqs_width = torch.cat([freqs_neg[2][-(width - width // 2):], freqs_pos[2][: width // 2]], dim=0)
+                freqs_width = freqs_width.view(1, 1, width, -1).expand(frame, height, width, -1)
+            else:
+                freqs_height = freqs_pos[1][:height].view(1, height, 1, -1).expand(frame, height, width, -1)
+                freqs_width = freqs_pos[2][:width].view(1, 1, width, -1).expand(frame, height, width, -1)
+
+            freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
+            self.cond_rope_cache[rope_key] = freqs.clone().contiguous()
+        vid_freqs = self.cond_rope_cache[rope_key]
+
+        return vid_freqs
+
+    def get_img_rope_by_bbox(self, video_fhw, bbox, device):
+        if self.pos_freqs.device != device:
+            self.pos_freqs = self.pos_freqs.to(device)
+            self.neg_freqs = self.neg_freqs.to(device)
+
+        if isinstance(video_fhw, list):
+            video_fhw = video_fhw[0]
+        frame, height, width = video_fhw
+
+        x1, y1, x2, y2 = bbox
+
+        x0 = -(width - width // 2)
+        y0 = -(height - height // 2)
+
+        seq_lens = frame * ((y2 - y1) + 1) * ((x2 - x1) + 1)
+        freqs_pos = self.pos_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+        freqs_neg = self.neg_freqs.split([x // 2 for x in self.axes_dim], dim=1)
+        freqs_frame = freqs_pos[0][:frame].view(frame, 1, 1, -1).expand(frame, (y2 - y1) + 1, (x2 - x1) + 1, -1)
+
+        index_height_neg = [y + y0 for y in range(y1, y2 + 1, 1) if (y + y0) < 0]
+        index_height_pos = [y + y0 for y in range(y1, y2 + 1, 1) if (y + y0) >= 0]
+        freqs_height = torch.cat([freqs_neg[1][index_height_neg], freqs_pos[1][index_height_pos]], dim=0)
+        freqs_height = freqs_height.view(1, (y2 - y1) + 1, 1, -1).expand(frame, (y2 - y1) + 1, (x2 - x1) + 1, -1)
+
+        index_width_neg = [x + x0 for x in range(x1, x2 + 1, 1) if (x + x0) < 0]
+        index_width_pos = [x + x0 for x in range(x1, x2 + 1, 1) if (x + x0) >= 0]
+        freqs_width = torch.cat([freqs_neg[2][index_width_neg], freqs_pos[2][index_width_pos]], dim=0)
+        freqs_width = freqs_width.view(1, 1, (x2 - x1) + 1, -1).expand(frame, (y2 - y1) + 1, (x2 - x1) + 1, -1)
+
+        freqs = torch.cat([freqs_frame, freqs_height, freqs_width], dim=-1).reshape(seq_lens, -1)
+        vid_freqs = freqs
+
+        return vid_freqs
+
+
+class QwenDoubleStreamAttnProcessor2_0:
+    """
+    Attention processor for Qwen double-stream architecture, matching DoubleStreamLayerMegatron logic. This processor
+    implements joint attention computation where text and image streams are processed together.
+    """
+
+    _attention_backend = None
+
+    def __init__(self):
+        if not hasattr(F, "scaled_dot_product_attention"):
+            raise ImportError(
+                "QwenDoubleStreamAttnProcessor2_0 requires PyTorch 2.0. To use it, please upgrade PyTorch to 2.0."
+            )
+
+    def __call__(
+        self,
+        attn: Attention,
+        hidden_states: torch.FloatTensor,  # Image stream
+        encoder_hidden_states: torch.FloatTensor = None,  # Text stream
+        encoder_hidden_states_mask: torch.FloatTensor = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        image_rotary_emb: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        if encoder_hidden_states is None:
+            raise ValueError("QwenDoubleStreamAttnProcessor2_0 requires encoder_hidden_states (text stream)")
+
+        seq_txt = encoder_hidden_states.shape[1]
+
+        # Compute QKV for image stream (sample projections)
+        img_query = attn.to_q(hidden_states)
+        img_key = attn.to_k(hidden_states)
+        img_value = attn.to_v(hidden_states)
+
+        # Compute QKV for text stream (context projections)
+        txt_query = attn.add_q_proj(encoder_hidden_states)
+        txt_key = attn.add_k_proj(encoder_hidden_states)
+        txt_value = attn.add_v_proj(encoder_hidden_states)
+
+        # Reshape for multi-head attention
+        img_query = img_query.unflatten(-1, (attn.heads, -1))
+        img_key = img_key.unflatten(-1, (attn.heads, -1))
+        img_value = img_value.unflatten(-1, (attn.heads, -1))
+
+        txt_query = txt_query.unflatten(-1, (attn.heads, -1))
+        txt_key = txt_key.unflatten(-1, (attn.heads, -1))
+        txt_value = txt_value.unflatten(-1, (attn.heads, -1))
+
+        # Apply QK normalization
+        if attn.norm_q is not None:
+            img_query = attn.norm_q(img_query)
+        if attn.norm_k is not None:
+            img_key = attn.norm_k(img_key)
+        if attn.norm_added_q is not None:
+            txt_query = attn.norm_added_q(txt_query)
+        if attn.norm_added_k is not None:
+            txt_key = attn.norm_added_k(txt_key)
+
+        # Apply RoPE
+        if image_rotary_emb is not None:
+            img_freqs, txt_freqs = image_rotary_emb
+            img_query = apply_rotary_emb_qwen(img_query, img_freqs, use_real=False)
+            img_key = apply_rotary_emb_qwen(img_key, img_freqs, use_real=False)
+            txt_query = apply_rotary_emb_qwen(txt_query, txt_freqs, use_real=False)
+            txt_key = apply_rotary_emb_qwen(txt_key, txt_freqs, use_real=False)
+
+        # Concatenate for joint attention
+        # Order: [text, image]
+        joint_query = torch.cat([txt_query, img_query], dim=1)
+        joint_key = torch.cat([txt_key, img_key], dim=1)
+        joint_value = torch.cat([txt_value, img_value], dim=1)
+
+        # Compute joint attention
+        joint_hidden_states = dispatch_attention_fn(
+            joint_query,
+            joint_key,
+            joint_value,
+            attn_mask=attention_mask,
+            dropout_p=0.0,
+            is_causal=False,
+            backend=self._attention_backend,
+        )
+
+        # Reshape back
+        joint_hidden_states = joint_hidden_states.flatten(2, 3)
+        joint_hidden_states = joint_hidden_states.to(joint_query.dtype)
+
+        # Split attention outputs back
+        txt_attn_output = joint_hidden_states[:, :seq_txt, :]  # Text part
+        img_attn_output = joint_hidden_states[:, seq_txt:, :]  # Image part
+
+        # Apply output projections
+        img_attn_output = attn.to_out[0](img_attn_output)
+        if len(attn.to_out) > 1:
+            img_attn_output = attn.to_out[1](img_attn_output)  # dropout
+
+        txt_attn_output = attn.to_add_out(txt_attn_output)
+
+        return img_attn_output, txt_attn_output
+
+
+@maybe_allow_in_graph
+class QwenImageTransformerBlock(nn.Module):
+    def __init__(
+        self, dim: int, num_attention_heads: int, attention_head_dim: int, block_index: int, qk_norm: str = "rms_norm", eps: float = 1e-6
+    ):
+        super().__init__()
+
+        self.dim = dim
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+
+        # Image processing modules
+        self.img_mod = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(dim, 6 * dim, bias=True),  # For scale, shift, gate for norm1 and norm2
+        )
+        self.img_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.attn = Attention(
+            query_dim=dim,
+            cross_attention_dim=None,  # Enable cross attention for joint computation
+            added_kv_proj_dim=dim,  # Enable added KV projections for text stream
+            dim_head=attention_head_dim,
+            heads=num_attention_heads,
+            out_dim=dim,
+            context_pre_only=False,
+            bias=True,
+            processor=QwenDoubleStreamAttnProcessor2_0(),
+            qk_norm=qk_norm,
+            eps=eps,
+        )
+        self.img_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.block_index = block_index
+        if block_index < TRANSFORMER_BLOCK_BAR:  # Replace the last part of the layers with MoE
+            self.img_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+        else:
+            self.moe_args = Arguments(
+                hidden_size=dim,
+                ffn_hidden_size=dim * 4,  # Keep ffn_hidden_size consistent with FeedForward mult=4
+                num_layers=TRANSFORMER_NUM_LAYERS,  # Number of MoE layers
+                bias=True,
+                activation_fn=partial(F.gelu, approximate="tanh"),  # Keep consistent with FeedForward
+                moe_num_experts=MOE_NUM_EXPERTS,  # Number of experts; adjust as needed
+                moe_top_k=1,  # Top-k experts per token (1 means top-1)
+                moe_loss_weight=0.01,  # Load balancing loss weight
+                moe_capacity_factor=1.25,  # Capacity factor for handling load imbalance
+                mlp_type="mlp",
+                shared_expert=False,  # Do not use shared experts
+                mlp_impl="grouped",  # Use 'grouped' implementation
+                init_method=nn.init.xavier_uniform_,
+                memory_optimized_mlp=True,  # Optimize MLP activation memory
+                moe_expert_model_parallelism=False,
+                expert_parallel_group=None,
+                fp16=False,
+                bf16=True,
+            )
+            self.img_mlp = dMoE(self.moe_args)
+
+        # Text processing modules
+        self.txt_mod = nn.Sequential(
+            nn.SiLU(),
+            nn.Linear(dim, 6 * dim, bias=True),  # For scale, shift, gate for norm1 and norm2
+        )
+        self.txt_norm1 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        # Text doesn't need separate attention - it's handled by the joint attention computation
+        self.txt_norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=eps)
+        self.txt_mlp = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
+
+    def _modulate(self, x, mod_params):
+        """Apply modulation to input tensor"""
+        shift, scale, gate = mod_params.chunk(3, dim=-1)
+        return x * (1 + scale.unsqueeze(1)) + shift.unsqueeze(1), gate.unsqueeze(1)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor,
+        encoder_hidden_states_mask: torch.Tensor,
+        temb: torch.Tensor,
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
+        img_shapes=None,
+        timestep=None,
+        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        maybe_set_cuda_device_from_tensor(hidden_states)
+
+        # Get modulation parameters for both streams
+        img_mod_params = self.img_mod(temb)  # [B, 6*dim]
+        txt_mod_params = self.txt_mod(temb)  # [B, 6*dim]
+
+        # Split modulation parameters for norm1 and norm2
+        img_mod1, img_mod2 = img_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]
+        txt_mod1, txt_mod2 = txt_mod_params.chunk(2, dim=-1)  # Each [B, 3*dim]
+
+        # Process image stream - norm1 + modulation
+        img_normed = self.img_norm1(hidden_states)
+        img_modulated, img_gate1 = self._modulate(img_normed, img_mod1)
+
+        # Process text stream - norm1 + modulation
+        txt_normed = self.txt_norm1(encoder_hidden_states)
+        txt_modulated, txt_gate1 = self._modulate(txt_normed, txt_mod1)
+
+        # Use QwenDoubleStreamAttnProcessor2_0 for joint attention computation
+        # This directly implements the DoubleStreamLayerMegatron logic:
+        # 1. Computes QKV for both streams
+        # 2. Applies QK normalization and RoPE
+        # 3. Concatenates and runs joint attention
+        # 4. Splits results back to separate streams
+        joint_attention_kwargs = joint_attention_kwargs or {}
+        attn_output = self.attn(
+            hidden_states=img_modulated,  # Image stream (will be processed as "sample")
+            encoder_hidden_states=txt_modulated,  # Text stream (will be processed as "context")
+            encoder_hidden_states_mask=encoder_hidden_states_mask,
+            image_rotary_emb=image_rotary_emb,
+            **joint_attention_kwargs,
+        )
+
+        # The processor returns (img_output, txt_output) when encoder_hidden_states is provided
+        img_attn_output, txt_attn_output = attn_output
+
+        # Apply attention gates and add residual (like in Megatron)
+        hidden_states = hidden_states + img_gate1 * img_attn_output
+        encoder_hidden_states = encoder_hidden_states + txt_gate1 * txt_attn_output
+
+        # Process image stream - norm2 + MLP
+        img_normed2 = self.img_norm2(hidden_states)
+        img_modulated2, img_gate2 = self._modulate(img_normed2, img_mod2)
+
+        if self.block_index < TRANSFORMER_BLOCK_BAR:
+            img_mlp_output = self.img_mlp(img_modulated2)
+        else:
+            # dMoE.forward returns (output, bias) due to return_bias=True default
+            img_mlp_output = self.img_mlp(img_modulated2)[0]
+        hidden_states = hidden_states + img_gate2 * img_mlp_output
+
+        # Process text stream - norm2 + MLP
+        txt_normed2 = self.txt_norm2(encoder_hidden_states)
+        txt_modulated2, txt_gate2 = self._modulate(txt_normed2, txt_mod2)
+        txt_mlp_output = self.txt_mlp(txt_modulated2)
+        encoder_hidden_states = encoder_hidden_states + txt_gate2 * txt_mlp_output
+
+        # Clip to prevent overflow for fp16
+        if encoder_hidden_states.dtype == torch.float16:
+            encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
+        if hidden_states.dtype == torch.float16:
+            hidden_states = hidden_states.clip(-65504, 65504)
+
+        return encoder_hidden_states, hidden_states
+
+
+class QwenImageTransformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, CacheMixin):
+    """
+    The Transformer model introduced in Qwen.
+
+    Args:
+        patch_size (`int`, defaults to `2`):
+            Patch size to turn the input data into small patches.
+        in_channels (`int`, defaults to `64`):
+            The number of channels in the input.
+        out_channels (`int`, *optional*, defaults to `None`):
+            The number of channels in the output. If not specified, it defaults to `in_channels`.
+        num_layers (`int`, defaults to `60`):
+            The number of layers of dual stream DiT blocks to use.
+        attention_head_dim (`int`, defaults to `128`):
+            The number of dimensions to use for each attention head.
+        num_attention_heads (`int`, defaults to `24`):
+            The number of attention heads to use.
+        joint_attention_dim (`int`, defaults to `3584`):
+            The number of dimensions to use for the joint attention (embedding/channel dimension of
+            `encoder_hidden_states`).
+        guidance_embeds (`bool`, defaults to `False`):
+            Whether to use guidance embeddings for the guidance-distilled variant of the model.
+        axes_dims_rope (`Tuple[int]`, defaults to `(16, 56, 56)`):
+            The dimensions to use for the rotary positional embeddings.
+    """
+
+    _supports_gradient_checkpointing = True
+    _no_split_modules = ["QwenImageTransformerBlock"]
+    _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
+
+    @register_to_config
+    def __init__(
+        self,
+        patch_size: int = 2,
+        in_channels: int = 64,
+        out_channels: Optional[int] = 16,
+        num_layers: int = 60,
+        attention_head_dim: int = 128,
+        num_attention_heads: int = 24,
+        joint_attention_dim: int = 3584,
+        guidance_embeds: bool = False,  # TODO: this should probably be removed
+        axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
+    ):
+        super().__init__()
+        self.out_channels = out_channels or in_channels
+        self.inner_dim = num_attention_heads * attention_head_dim
+
+        self.pos_embed = QwenEmbedRope(theta=10000, axes_dim=list(axes_dims_rope), scale_rope=True)
+
+        self.time_text_embed = QwenTimestepProjEmbeddings(embedding_dim=self.inner_dim)
+
+        self.txt_norm = RMSNorm(joint_attention_dim, eps=1e-6)
+
+        self.img_in = nn.Linear(in_channels, self.inner_dim)
+        self.txt_in = nn.Linear(joint_attention_dim, self.inner_dim)
+
+        self.transformer_blocks = nn.ModuleList(
+            [
+                QwenImageTransformerBlock(
+                    dim=self.inner_dim,
+                    num_attention_heads=num_attention_heads,
+                    attention_head_dim=attention_head_dim,
+                    block_index=block_index,
+                )
+                for block_index in range(num_layers)
+            ]
+        )
+
+        self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
+        self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: torch.Tensor = None,
+        encoder_hidden_states_mask: torch.Tensor = None,
+        timestep: torch.LongTensor = None,
+        img_shapes: Optional[List[Tuple[int, int, int]]] = None,
+        txt_seq_lens: Optional[List[int]] = None,
+        guidance: torch.Tensor = None,  # TODO: this should probably be removed
+        attention_kwargs: Optional[Dict[str, Any]] = None,
+        return_dict: bool = True,
+        cond_hidden_states=None,
+        cond_rope=None,
+    ) -> Union[torch.Tensor, Transformer2DModelOutput]:
+        """
+        The [`QwenImageTransformer2DModel`] forward method.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
+                Input `hidden_states`.
+            encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
+                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
+            encoder_hidden_states_mask (`torch.Tensor` of shape `(batch_size, text_sequence_length)`):
+                Mask of the input conditions.
+            timestep (`torch.LongTensor`):
+                Used to indicate denoising step.
+            attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+                `self.processor` in
+                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
+                tuple.
+
+        Returns:
+            If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
+            `tuple` where the first element is the sample tensor.
+        """
+        if attention_kwargs is not None:
+            attention_kwargs = attention_kwargs.copy()
+            lora_scale = attention_kwargs.pop("scale", 1.0)
+        else:
+            lora_scale = 1.0
+
+        if USE_PEFT_BACKEND:
+            # weight the lora layers by setting `lora_scale` for each PEFT layer
+            scale_lora_layers(self, lora_scale)
+        else:
+            if attention_kwargs is not None and attention_kwargs.get("scale", None) is not None:
+                logger.warning(
+                    "Passing `scale` via `attention_kwargs` when not using the PEFT backend is ineffective."
+                )
+
+        if cond_hidden_states is not None:
+            length_raw_hidden_states = hidden_states.shape[1]
+            hidden_states = torch.cat([hidden_states, cond_hidden_states], dim=1)
+
+        hidden_states = self.img_in(hidden_states)
+
+        timestep = timestep.to(hidden_states.dtype)
+        encoder_hidden_states = self.txt_norm(encoder_hidden_states)
+        encoder_hidden_states = self.txt_in(encoder_hidden_states)
+
+        if guidance is not None:
+            guidance = guidance.to(hidden_states.dtype) * 1000
+
+        temb = (
+            self.time_text_embed(timestep, hidden_states)
+            if guidance is None
+            else self.time_text_embed(timestep, guidance, hidden_states)
+        )
+
+        image_rotary_emb = self.pos_embed(img_shapes, txt_seq_lens, device=hidden_states.device)
+        if cond_rope is not None:
+            img_freqs, txt_freqs = image_rotary_emb
+            img_freqs = torch.cat([img_freqs, cond_rope], dim=0)
+            image_rotary_emb = img_freqs, txt_freqs
+
+        for index_block, block in enumerate(self.transformer_blocks):
+            if torch.is_grad_enabled() and self.gradient_checkpointing:
+                encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
+                    block,
+                    hidden_states,
+                    encoder_hidden_states,
+                    encoder_hidden_states_mask,
+                    temb,
+                    image_rotary_emb,
+                    attention_kwargs,
+                )
+            else:
+                encoder_hidden_states, hidden_states = block(
+                    hidden_states=hidden_states,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_hidden_states_mask=encoder_hidden_states_mask,
+                    temb=temb,
+                    image_rotary_emb=image_rotary_emb,
+                    img_shapes=img_shapes,
+                    timestep=timestep,
+                    joint_attention_kwargs=attention_kwargs,
+                )
+
+        if cond_hidden_states is not None:
+            hidden_states = hidden_states[:, :length_raw_hidden_states]
+
+        # Use only the image part (hidden_states) from the dual-stream blocks
+        hidden_states = self.norm_out(hidden_states, temb)
+        output = self.proj_out(hidden_states)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (output,)
+
+        return Transformer2DModelOutput(sample=output)
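
With `TRANSFORMER_NUM_LAYERS = 10` and the model's default `num_layers = 60`, `TRANSFORMER_BLOCK_BAR = 50`: blocks 0-49 keep the dense `FeedForward` as their `img_mlp`, while blocks 50-59 construct a megablocks `dMoE` with `MOE_NUM_EXPERTS = 4` experts and top-1 routing. A tiny sketch of the split the constructor applies:

```python
# Sketch of the dense-vs-MoE split implied by TRANSFORMER_BLOCK_BAR above.
NUM_LAYERS = 60
TRANSFORMER_NUM_LAYERS = 10                                   # layers converted to MoE
TRANSFORMER_BLOCK_BAR = NUM_LAYERS - TRANSFORMER_NUM_LAYERS   # = 50

layer_kind = [
    "dense" if block_index < TRANSFORMER_BLOCK_BAR else "dMoE"
    for block_index in range(NUM_LAYERS)
]
assert layer_kind.count("dMoE") == TRANSFORMER_NUM_LAYERS
assert layer_kind[49] == "dense" and layer_kind[50] == "dMoE"
```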
src/pipelines/pipeline_qwenimage_tagmoe.py ADDED
@@ -0,0 +1,1068 @@
1
+ # Copyright 2025 Qwen-Image Team and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ import os
17
+ from typing import Any, Callable, Dict, List, Optional, Union
18
+ from PIL import Image
19
+
20
+ import numpy as np
21
+ import torch
22
+ from transformers import Qwen2_5_VLForConditionalGeneration, Qwen2Tokenizer, AutoProcessor
23
+
24
+ from diffusers.image_processor import VaeImageProcessor
25
+ from diffusers.loaders import QwenImageLoraLoaderMixin
26
+ from diffusers.models import AutoencoderKLQwenImage
27
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
28
+ from diffusers.utils import is_torch_xla_available, logging, replace_example_docstring
29
+ from diffusers.utils.torch_utils import randn_tensor
30
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
31
+ from diffusers.pipelines.qwenimage.pipeline_output import QwenImagePipelineOutput
32
+ from qwen_vl_utils import process_vision_info
33
+
34
+ from src.models.transformer_qwenimage_tagmoe import QwenImageTransformer2DModel
35
+
36
+ if is_torch_xla_available():
37
+ import torch_xla.core.xla_model as xm
38
+
39
+ XLA_AVAILABLE = True
40
+ else:
41
+ XLA_AVAILABLE = False
42
+
43
+
44
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
45
+
46
+ EXAMPLE_DOC_STRING = """
47
+ Examples:
48
+ ```py
49
+ >>> import torch
50
+ >>> from diffusers import QwenImagePipeline
51
+
52
+ >>> pipe = QwenImagePipeline.from_pretrained("Qwen/Qwen-Image", torch_dtype=torch.bfloat16)
53
+ >>> pipe.to("cuda")
54
+ >>> prompt = "A cat holding a sign that says hello world"
55
+ >>> # Depending on the variant being used, the pipeline call will slightly vary.
56
+ >>> # Refer to the pipeline documentation for more details.
57
+ >>> image = pipe(prompt, num_inference_steps=50).images[0]
58
+ >>> image.save("qwenimage.png")
59
+ ```
60
+ """
61
+
62
+
63
+ def calculate_shift(
64
+ image_seq_len,
65
+ base_seq_len: int = 256,
66
+ max_seq_len: int = 4096,
67
+ base_shift: float = 0.5,
68
+ max_shift: float = 1.15,
69
+ ):
70
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
71
+ b = base_shift - m * base_seq_len
72
+ mu = image_seq_len * m + b
73
+ return mu
74
+
75
+
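
`calculate_shift` linearly interpolates the flow-matching timestep shift `mu` between `base_shift` and `max_shift` as the packed image sequence length grows; longer sequences get a larger shift. A worked check with the defaults (a 1024x1024 image at `vae_scale_factor` 8 with 2x2 packing yields `(1024 // 16) ** 2 = 4096` tokens):

```py
assert abs(calculate_shift(256) - 0.5) < 1e-9    # base_seq_len  -> base_shift
assert abs(calculate_shift(4096) - 1.15) < 1e-9  # max_seq_len   -> max_shift
assert abs(calculate_shift(1024) - 0.63) < 1e-9  # in-between lengths interpolate linearly
```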
76
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
77
+ def retrieve_timesteps(
78
+ scheduler,
79
+ num_inference_steps: Optional[int] = None,
80
+ device: Optional[Union[str, torch.device]] = None,
81
+ timesteps: Optional[List[int]] = None,
82
+ sigmas: Optional[List[float]] = None,
83
+ **kwargs,
84
+ ):
85
+ r"""
86
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
87
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
88
+
89
+ Args:
90
+ scheduler (`SchedulerMixin`):
91
+ The scheduler to get timesteps from.
92
+ num_inference_steps (`int`):
93
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
94
+ must be `None`.
95
+ device (`str` or `torch.device`, *optional*):
96
+ The device to which the timesteps should be moved to. If `None`, the timesteps are not moved.
97
+ timesteps (`List[int]`, *optional*):
98
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
99
+ `num_inference_steps` and `sigmas` must be `None`.
100
+ sigmas (`List[float]`, *optional*):
101
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
102
+ `num_inference_steps` and `timesteps` must be `None`.
103
+
104
+ Returns:
105
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
106
+ second element is the number of inference steps.
107
+ """
108
+ if timesteps is not None and sigmas is not None:
109
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
110
+ if timesteps is not None:
111
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
112
+ if not accepts_timesteps:
113
+ raise ValueError(
114
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
115
+ f" timestep schedules. Please check whether you are using the correct scheduler."
116
+ )
117
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
118
+ timesteps = scheduler.timesteps
119
+ num_inference_steps = len(timesteps)
120
+ elif sigmas is not None:
121
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
122
+ if not accept_sigmas:
123
+ raise ValueError(
124
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
125
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
126
+ )
127
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
128
+ timesteps = scheduler.timesteps
129
+ num_inference_steps = len(timesteps)
130
+ else:
131
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
132
+ timesteps = scheduler.timesteps
133
+ return timesteps, num_inference_steps
134
+
135
+
136
+ class QwenImagePipeline(DiffusionPipeline, QwenImageLoraLoaderMixin):
137
+ r"""
138
+ The QwenImage pipeline for text-to-image generation.
139
+
140
+ Args:
141
+ transformer ([`QwenImageTransformer2DModel`]):
142
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
143
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
144
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
145
+ vae ([`AutoencoderKL`]):
146
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
147
+ text_encoder ([`Qwen2_5_VLForConditionalGeneration`]):
148
+ [Qwen2.5-VL-7B-Instruct](https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct), a multimodal large
149
+ language model used here as the text encoder.
150
+ tokenizer (`Qwen2Tokenizer`):
151
+ Tokenizer of class
152
+ [Qwen2Tokenizer](https://huggingface.co/docs/transformers/en/model_doc/qwen2).
153
+ """
154
+
155
+ model_cpu_offload_seq = "text_encoder->transformer->vae"
156
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
157
+
158
+ def __init__(
159
+ self,
160
+ scheduler: FlowMatchEulerDiscreteScheduler,
161
+ vae: AutoencoderKLQwenImage,
162
+ text_encoder: Qwen2_5_VLForConditionalGeneration,
163
+ tokenizer: Qwen2Tokenizer,
164
+ transformer: QwenImageTransformer2DModel,
165
+ vlm_processor: AutoProcessor = None,
166
+ ):
167
+ super().__init__()
168
+
169
+ self.register_modules(
170
+ vae=vae,
171
+ text_encoder=text_encoder,
172
+ tokenizer=tokenizer,
173
+ transformer=transformer,
174
+ scheduler=scheduler,
175
+ vlm_processor=vlm_processor,
176
+ )
177
+ self.vae_scale_factor = 2 ** len(self.vae.temperal_downsample) if getattr(self, "vae", None) else 8
178
+ # QwenImage latents are turned into 2x2 patches and packed. This means the latent width and height has to be divisible
179
+ # by the patch size. So the vae scale factor is multiplied by the patch size to account for this
180
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)
181
+ self.tokenizer_max_length = 1024
182
+ self.prompt_template_encode = "<|im_start|>system\nDescribe the image by detailing the color, shape, size, texture, quantity, text, spatial relationships of the objects and background:<|im_end|>\n<|im_start|>user\n{}<|im_end|>\n<|im_start|>assistant\n"
183
+ self.prompt_template_encode_start_idx = 34
184
+ self.default_sample_size = 128
185
+
186
+ def _extract_masked_hidden(self, hidden_states: torch.Tensor, mask: torch.Tensor):
187
+ bool_mask = mask.bool()
188
+ valid_lengths = bool_mask.sum(dim=1)
189
+ selected = hidden_states[bool_mask]
190
+ split_result = torch.split(selected, valid_lengths.tolist(), dim=0)
191
+
192
+ return split_result
193
+
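
`_extract_masked_hidden` undoes batch padding: it keeps only the positions where the attention mask is 1 and returns one variable-length tensor per sample. A small illustration (invoking the method unbound, since it never touches `self`):

```py
import torch

h = torch.arange(12.0).reshape(2, 3, 2)         # [batch=2, seq=3, dim=2]
mask = torch.tensor([[1, 1, 0], [1, 0, 0]])
a, b = QwenImagePipeline._extract_masked_hidden(None, h, mask)
assert a.shape == (2, 2) and b.shape == (1, 2)  # 2 valid tokens vs. 1
```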
194
+ @staticmethod
195
+ def _get_module_input_device(module):
196
+ device_map = getattr(module, "hf_device_map", None)
197
+ if device_map is not None:
198
+ for mapped_device in device_map.values():
199
+ if mapped_device in ("cpu", "disk"):
200
+ continue
201
+ if isinstance(mapped_device, int):
202
+ return torch.device("cuda", mapped_device)
203
+ return torch.device(mapped_device)
204
+ return next(module.parameters()).device
205
+
206
+ def _get_qwen_prompt_embeds(
207
+ self,
208
+ prompt: Union[str, List[str]] = None,
209
+ device: Optional[torch.device] = None,
210
+ dtype: Optional[torch.dtype] = None,
211
+ ):
212
+ device = device or self._execution_device
213
+ dtype = dtype or self.text_encoder.dtype
214
+
215
+ prompt = [prompt] if isinstance(prompt, str) else prompt
216
+
217
+ template = self.prompt_template_encode
218
+ drop_idx = self.prompt_template_encode_start_idx
219
+ txt = [template.format(e) for e in prompt]
220
+ text_encoder_device = self._get_module_input_device(self.text_encoder)
221
+ txt_tokens = self.tokenizer(
222
+ txt, max_length=self.tokenizer_max_length + drop_idx, padding=True, truncation=True, return_tensors="pt"
223
+ ).to(text_encoder_device)
224
+ encoder_hidden_states = self.text_encoder(
225
+ input_ids=txt_tokens.input_ids,
226
+ attention_mask=txt_tokens.attention_mask,
227
+ output_hidden_states=True,
228
+ )
229
+ hidden_states = encoder_hidden_states.hidden_states[-1]
230
+ split_hidden_states = self._extract_masked_hidden(hidden_states, txt_tokens.attention_mask)
231
+ split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
232
+ attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
233
+ max_seq_len = max([e.size(0) for e in split_hidden_states])
234
+ prompt_embeds = torch.stack(
235
+ [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
236
+ )
237
+ encoder_attention_mask = torch.stack(
238
+ [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
239
+ )
240
+
241
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
242
+
243
+ return prompt_embeds, encoder_attention_mask
244
+
245
+ def _get_qwenvl_prompt_embeds(
246
+ self,
247
+ prompt: Union[str, List[str]] = None,
248
+ device: Optional[torch.device] = None,
249
+ dtype: Optional[torch.dtype] = None,
250
+ image: Optional[Image.Image] = None,
251
+ ):
252
+ device = device or self._execution_device
253
+ dtype = dtype or self.text_encoder.dtype
254
+
255
+ prompt = [prompt] if isinstance(prompt, str) else prompt
256
+ assert len(prompt) == 1
257
+
258
+ template = self.prompt_template_encode
259
+ drop_idx = self.prompt_template_encode_start_idx
260
+
261
+ messages = [
262
+ {
263
+ "role": "system",
264
+ "content": [{"type": "text", "text": f"{template}"}],
265
+ },
266
+ {
267
+ "role": "user",
268
+ "content": []
269
+ }
270
+ ]
271
+
272
+ # 先添加所有的 image
273
+ # messages[0]["content"].extend([{"type": "image", "image": img} for img in image_list])
274
+ messages[1]['content'].append({"type": "image", "image": image})
275
+ # print(text)
276
+ # 再添加 text
277
+ messages[1]["content"].append({"type": "text", "text": f"{prompt[0]}"})
278
+
279
+ # Preparation for inference
280
+ text = self.vlm_processor.apply_chat_template(
281
+ messages, tokenize=False, add_generation_prompt=True, add_vision_id=True
282
+ )
283
+
284
+ image_inputs, video_inputs = process_vision_info(messages)
285
+
286
+ kwargs = dict(truncation=True, padding=True, max_length=self.tokenizer_max_length + drop_idx + 374, return_tensors="pt")
287
+ txt_tokens = self.vlm_processor(
288
+ text=[text],
289
+ images=image_inputs,
290
+ **kwargs,
291
+ )
292
+
293
+ text_encoder_device = self._get_module_input_device(self.text_encoder)
294
+ encoder_hidden_states = self.text_encoder(
295
+ input_ids=txt_tokens.input_ids.to(text_encoder_device),
296
+ attention_mask=txt_tokens.attention_mask.to(text_encoder_device),
297
+ pixel_values=txt_tokens.pixel_values.to(text_encoder_device),
298
+ image_grid_thw=txt_tokens.image_grid_thw.to(text_encoder_device),
299
+ output_hidden_states=True
300
+ )
301
+
304
+ hidden_states = encoder_hidden_states.hidden_states[-1]
305
+ split_hidden_states = self._extract_masked_hidden(hidden_states, txt_tokens.attention_mask)
306
+ split_hidden_states = [e[drop_idx:] for e in split_hidden_states]
307
+ attn_mask_list = [torch.ones(e.size(0), dtype=torch.long, device=e.device) for e in split_hidden_states]
308
+ max_seq_len = max([e.size(0) for e in split_hidden_states])
309
+ prompt_embeds = torch.stack(
310
+ [torch.cat([u, u.new_zeros(max_seq_len - u.size(0), u.size(1))]) for u in split_hidden_states]
311
+ )
312
+ encoder_attention_mask = torch.stack(
313
+ [torch.cat([u, u.new_zeros(max_seq_len - u.size(0))]) for u in attn_mask_list]
314
+ )
315
+
316
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
317
+
318
+ return prompt_embeds, encoder_attention_mask
319
+
320
+ def encode_prompt(
321
+ self,
322
+ prompt: Union[str, List[str]],
323
+ device: Optional[torch.device] = None,
324
+ num_images_per_prompt: int = 1,
325
+ prompt_embeds: Optional[torch.Tensor] = None,
326
+ prompt_embeds_mask: Optional[torch.Tensor] = None,
327
+ max_sequence_length: int = 1024,
328
+ image=None,
329
+ ):
330
+ r"""
331
+
332
+ Args:
333
+ prompt (`str` or `List[str]`, *optional*):
334
+ prompt to be encoded
335
+ device: (`torch.device`):
336
+ torch device
337
+ num_images_per_prompt (`int`):
338
+ number of images that should be generated per prompt
339
+ prompt_embeds (`torch.Tensor`, *optional*):
340
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
341
+ provided, text embeddings will be generated from `prompt` input argument.
342
+ """
343
+ device = device or self._execution_device
344
+
345
+ prompt = [prompt] if isinstance(prompt, str) else prompt
346
+ batch_size = len(prompt) if prompt_embeds is None else prompt_embeds.shape[0]
347
+
348
+ if image is not None:
349
+ if self.vlm_processor is None:
350
+ raise ValueError(
351
+ "VLM processor is not initialized. Please make sure to pass a valid VLM processor to the pipeline."
352
+ )
353
+ prompt_embeds, prompt_embeds_mask = self._get_qwenvl_prompt_embeds(
354
+ prompt=prompt, device=device, dtype=self.text_encoder.dtype, image=image
355
+ )
356
+ elif prompt_embeds is None:
357
+ prompt_embeds, prompt_embeds_mask = self._get_qwen_prompt_embeds(prompt, device)
358
+
359
+ _, seq_len, _ = prompt_embeds.shape
360
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
361
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
362
+ prompt_embeds_mask = prompt_embeds_mask.repeat(1, num_images_per_prompt, 1)
363
+ prompt_embeds_mask = prompt_embeds_mask.view(batch_size * num_images_per_prompt, seq_len)
364
+
365
+ return prompt_embeds, prompt_embeds_mask
366
+
367
+ def check_inputs(
368
+ self,
369
+ prompt,
370
+ height,
371
+ width,
372
+ negative_prompt=None,
373
+ prompt_embeds=None,
374
+ negative_prompt_embeds=None,
375
+ prompt_embeds_mask=None,
376
+ negative_prompt_embeds_mask=None,
377
+ callback_on_step_end_tensor_inputs=None,
378
+ max_sequence_length=None,
379
+ ):
380
+ if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
381
+ logger.warning(
382
+ f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
383
+ )
384
+
385
+ if callback_on_step_end_tensor_inputs is not None and not all(
386
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
387
+ ):
388
+ raise ValueError(
389
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
390
+ )
391
+
392
+ if prompt is not None and prompt_embeds is not None:
393
+ raise ValueError(
394
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
395
+ " only forward one of the two."
396
+ )
397
+ elif prompt is None and prompt_embeds is None:
398
+ raise ValueError(
399
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
400
+ )
401
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
402
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
403
+
404
+ if negative_prompt is not None and negative_prompt_embeds is not None:
405
+ raise ValueError(
406
+ f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
407
+ f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
408
+ )
409
+
410
+ if prompt_embeds is not None and prompt_embeds_mask is None:
411
+ raise ValueError(
412
+ "If `prompt_embeds` are provided, `prompt_embeds_mask` also has to be passed. Make sure to generate `prompt_embeds_mask` from the same text encoder that was used to generate `prompt_embeds`."
413
+ )
414
+ if negative_prompt_embeds is not None and negative_prompt_embeds_mask is None:
415
+ raise ValueError(
416
+ "If `negative_prompt_embeds` are provided, `negative_prompt_embeds_mask` also has to be passed. Make sure to generate `negative_prompt_embeds_mask` from the same text encoder that was used to generate `negative_prompt_embeds`."
417
+ )
418
+
419
+ if max_sequence_length is not None and max_sequence_length > 1024:
420
+ raise ValueError(f"`max_sequence_length` cannot be greater than 1024 but is {max_sequence_length}")
421
+
422
+ @staticmethod
423
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
424
+ latent_image_ids = torch.zeros(height, width, 3)
425
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
426
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]
427
+
428
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
429
+
430
+ latent_image_ids = latent_image_ids.reshape(
431
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
432
+ )
433
+
434
+ return latent_image_ids.to(device=device, dtype=dtype)
435
+
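
`_prepare_latent_image_ids` builds a flat table of 3-channel position ids for the packed latent grid: the first channel stays zero, the other two hold the row and column index. For a 2x2 grid:

```py
import torch

ids = QwenImagePipeline._prepare_latent_image_ids(1, 2, 2, "cpu", torch.float32)
assert ids.tolist() == [[0, 0, 0], [0, 0, 1], [0, 1, 0], [0, 1, 1]]
```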
436
+ @staticmethod
437
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
438
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
439
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
440
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
441
+
442
+ return latents
443
+
444
+ @staticmethod
445
+ def _unpack_latents(latents, height, width, vae_scale_factor):
446
+ batch_size, num_patches, channels = latents.shape
447
+
448
+ # VAE applies 8x compression on images but we must also account for packing which requires
449
+ # latent height and width to be divisible by 2.
450
+ height = 2 * (int(height) // (vae_scale_factor * 2))
451
+ width = 2 * (int(width) // (vae_scale_factor * 2))
452
+
453
+ latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
454
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
455
+
456
+ latents = latents.reshape(batch_size, channels // (2 * 2), 1, height, width)
457
+
458
+ return latents
459
+
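
`_pack_latents` folds every 2x2 latent patch into the channel dimension and flattens the grid into a token sequence; `_unpack_latents` is its exact inverse (plus the singleton frame axis the video VAE expects). A round-trip shape check with illustrative sizes (16 latent channels, a 1024x1024 image, `vae_scale_factor` 8, hence a 128x128 latent grid):

```py
import torch

lat = torch.randn(1, 16, 128, 128)
packed = QwenImagePipeline._pack_latents(lat, 1, 16, 128, 128)
assert packed.shape == (1, 4096, 64)           # (128/2)*(128/2) tokens of 16*4 channels

unpacked = QwenImagePipeline._unpack_latents(packed, 1024, 1024, vae_scale_factor=8)
assert unpacked.shape == (1, 16, 1, 128, 128)
assert torch.equal(unpacked[:, :, 0], lat)     # exact round trip
```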
460
+ def enable_vae_slicing(self):
461
+ r"""
462
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
463
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
464
+ """
465
+ self.vae.enable_slicing()
466
+
467
+ def disable_vae_slicing(self):
468
+ r"""
469
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
470
+ computing decoding in one step.
471
+ """
472
+ self.vae.disable_slicing()
473
+
474
+ def enable_vae_tiling(self):
475
+ r"""
476
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
477
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and to allow
478
+ processing larger images.
479
+ """
480
+ self.vae.enable_tiling()
481
+
482
+ def disable_vae_tiling(self):
483
+ r"""
484
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
485
+ computing decoding in one step.
486
+ """
487
+ self.vae.disable_tiling()
488
+
489
+ def prepare_latents(
490
+ self,
491
+ batch_size,
492
+ num_channels_latents,
493
+ height,
494
+ width,
495
+ dtype,
496
+ device,
497
+ generator,
498
+ latents=None,
499
+ ):
500
+ # VAE applies 8x compression on images but we must also account for packing which requires
501
+ # latent height and width to be divisible by 2.
502
+ height = 2 * (int(height) // (self.vae_scale_factor * 2))
503
+ width = 2 * (int(width) // (self.vae_scale_factor * 2))
504
+
505
+ shape = (batch_size, 1, num_channels_latents, height, width)
506
+
507
+ if latents is not None:
508
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
509
+ return latents.to(device=device, dtype=dtype), latent_image_ids
510
+
511
+ if isinstance(generator, list) and len(generator) != batch_size:
512
+ raise ValueError(
513
+ f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
514
+ f" size of {batch_size}. Make sure the batch size matches the length of the generators."
515
+ )
516
+
517
+ latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
518
+ latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)
519
+
520
+ latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
521
+
522
+ return latents, latent_image_ids
523
+
524
+ @staticmethod
525
+ def _candidate_index_names(weight_name: Optional[str]) -> List[str]:
526
+ candidate_names = []
527
+ if weight_name:
528
+ if weight_name.endswith(".index.json"):
529
+ candidate_names.append(weight_name)
530
+ else:
531
+ candidate_names.append(f"{weight_name}.index.json")
532
+
533
+ for default_name in (
534
+ "diffusion_pytorch_model.safetensors.index.json",
535
+ "diffusion_pytorch_model.bin.index.json",
536
+ ):
537
+ if default_name not in candidate_names:
538
+ candidate_names.append(default_name)
539
+ return candidate_names
540
+
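
`_candidate_index_names` expands a weight file name into the list of sharded-checkpoint index files to probe, always appending the diffusers defaults as fallbacks:

```py
names = QwenImagePipeline._candidate_index_names("model.safetensors")
assert names == [
    "model.safetensors.index.json",
    "diffusion_pytorch_model.safetensors.index.json",
    "diffusion_pytorch_model.bin.index.json",
]
```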
541
+ @staticmethod
542
+ def _dedupe_paths(paths: List[str]) -> List[str]:
543
+ deduped_paths = []
544
+ seen = set()
545
+ for path in paths:
546
+ normalized = os.path.normpath(path)
547
+ if normalized in seen:
548
+ continue
549
+ deduped_paths.append(path)
550
+ seen.add(normalized)
551
+ return deduped_paths
552
+
553
+ def _resolve_custom_weights_files(
554
+ self,
555
+ weight_source: str,
556
+ weight_name: str = "diffusion_pytorch_model.safetensors",
557
+ subfolder: Optional[str] = "transformer",
558
+ cache_dir: Optional[str] = None,
559
+ revision: Optional[str] = None,
560
+ local_files_only: bool = False,
561
+ ) -> tuple[List[str], Optional[str]]:
562
+ from diffusers.utils.hub_utils import _get_checkpoint_shard_files, _get_model_file
563
+
564
+ if os.path.isfile(weight_source):
565
+ return [weight_source], None
566
+
567
+ index_name_candidates = self._candidate_index_names(weight_name)
568
+ normalized_subfolder = subfolder or ""
569
+
570
+ if os.path.isdir(weight_source):
571
+ candidate_paths: List[str] = []
572
+ if weight_name:
573
+ candidate_paths.append(os.path.join(weight_source, weight_name))
574
+ if subfolder and weight_name:
575
+ candidate_paths.append(os.path.join(weight_source, subfolder, weight_name))
576
+ candidate_paths = self._dedupe_paths(candidate_paths)
577
+ for candidate in candidate_paths:
578
+ if os.path.isfile(candidate):
579
+ return [candidate], None
580
+
581
+ candidate_index_paths: List[str] = []
582
+ for index_name in index_name_candidates:
583
+ candidate_index_paths.append(os.path.join(weight_source, index_name))
584
+ if subfolder:
585
+ candidate_index_paths.append(os.path.join(weight_source, subfolder, index_name))
586
+ candidate_index_paths = self._dedupe_paths(candidate_index_paths)
587
+
588
+ for index_path in candidate_index_paths:
589
+ if not os.path.isfile(index_path):
590
+ continue
591
+ shard_subfolder = os.path.relpath(os.path.dirname(index_path), weight_source)
592
+ if shard_subfolder == ".":
593
+ shard_subfolder = ""
594
+ shard_files, _ = _get_checkpoint_shard_files(
595
+ pretrained_model_name_or_path=weight_source,
596
+ index_filename=index_path,
597
+ subfolder=shard_subfolder,
598
+ local_files_only=True,
599
+ )
600
+ return shard_files, index_path
601
+
602
+ raise FileNotFoundError(
603
+ f"Cannot find transformer weights under directory '{weight_source}'. "
604
+ f"Tried files: {candidate_paths}. Tried index files: {candidate_index_paths}"
605
+ )
606
+
607
+ try:
608
+ resolved_file = _get_model_file(
609
+ pretrained_model_name_or_path=weight_source,
610
+ weights_name=weight_name,
611
+ subfolder=subfolder,
612
+ cache_dir=cache_dir,
613
+ local_files_only=local_files_only,
614
+ revision=revision,
615
+ )
616
+ return [resolved_file], None
617
+ except EnvironmentError as single_file_error:
618
+ for index_name in index_name_candidates:
619
+ try:
620
+ index_file = _get_model_file(
621
+ pretrained_model_name_or_path=weight_source,
622
+ weights_name=index_name,
623
+ subfolder=subfolder,
624
+ cache_dir=cache_dir,
625
+ local_files_only=local_files_only,
626
+ revision=revision,
627
+ )
628
+ except EnvironmentError:
629
+ continue
630
+
631
+ shard_files, _ = _get_checkpoint_shard_files(
632
+ pretrained_model_name_or_path=weight_source,
633
+ index_filename=index_file,
634
+ cache_dir=cache_dir,
635
+ local_files_only=local_files_only,
636
+ revision=revision,
637
+ subfolder=normalized_subfolder,
638
+ )
639
+ return shard_files, index_file
640
+
641
+ raise single_file_error
642
+
643
+ @staticmethod
644
+ def _unwrap_state_dict(checkpoint: Any) -> Dict[str, torch.Tensor]:
645
+ if not isinstance(checkpoint, dict):
646
+ return checkpoint
647
+
648
+ for key in ("model", "state_dict", "transformer"):
649
+ value = checkpoint.get(key)
650
+ if isinstance(value, dict):
651
+ return value
652
+ return checkpoint
653
+
654
+ def init_custom(
655
+ self,
656
+ weight_source: Optional[str],
657
+ weight_name: str = "diffusion_pytorch_model.safetensors",
658
+ subfolder: Optional[str] = "transformer",
659
+ cache_dir: Optional[str] = None,
660
+ revision: Optional[str] = None,
661
+ local_files_only: bool = False,
662
+ ):
663
+ if weight_source is None:
664
+ return
665
+
666
+ weights_files, index_file = self._resolve_custom_weights_files(
667
+ weight_source=weight_source,
668
+ weight_name=weight_name,
669
+ subfolder=subfolder,
670
+ cache_dir=cache_dir,
671
+ revision=revision,
672
+ local_files_only=local_files_only,
673
+ )
674
+
675
+ from safetensors.torch import load_file
676
+
677
+ all_unexpected_keys = []
678
+ for weights_file in weights_files:
679
+ if weights_file.endswith(".safetensors"):
680
+ model_weights = load_file(weights_file)
681
+ else:
682
+ try:
683
+ checkpoint = torch.load(weights_file, weights_only=True, map_location="cpu")
684
+ except TypeError:
685
+ checkpoint = torch.load(weights_file, map_location="cpu")
686
+ model_weights = self._unwrap_state_dict(checkpoint)
687
+
688
+ load_result = self.transformer.load_state_dict(model_weights, strict=False, assign=True)
689
+ if len(load_result.unexpected_keys) > 0:
690
+ all_unexpected_keys.extend(load_result.unexpected_keys)
691
+ del model_weights
692
+
693
+ if index_file is not None:
694
+ logger.info(f"Loaded transformer weights from {len(weights_files)} shards via index: {index_file}")
695
+
696
+ if len(all_unexpected_keys) > 0:
697
+ unique_unexpected_keys = list(dict.fromkeys(all_unexpected_keys))
698
+ logger.warning(f"Unexpected keys while loading transformer weights: {unique_unexpected_keys[:20]}")
699
+
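
A hedged loading sketch for `init_custom`. The repo id and defaults below match the Space's environment defaults; whether the custom transformer can be built straight from the base config is an assumption here, and the app may wire things up differently (e.g. with accelerate device maps):

```py
import torch

from src.models.transformer_qwenimage_tagmoe import QwenImageTransformer2DModel
from src.pipelines.pipeline_qwenimage_tagmoe import QwenImagePipeline

# Build the TAG-MoE transformer from the base config (assumption: the custom
# class accepts it), then let `init_custom` pull the checkpoint over it.
config = QwenImageTransformer2DModel.load_config("Qwen/Qwen-Image", subfolder="transformer")
transformer = QwenImageTransformer2DModel.from_config(config)

pipe = QwenImagePipeline.from_pretrained(
    "Qwen/Qwen-Image", transformer=transformer, torch_dtype=torch.bfloat16
)
pipe.init_custom(weight_source="YUXU915/TAG-MoE")  # single file or sharded index
pipe.to("cuda")
```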
700
+ @property
701
+ def guidance_scale(self):
702
+ return self._guidance_scale
703
+
704
+ @property
705
+ def attention_kwargs(self):
706
+ return self._attention_kwargs
707
+
708
+ @property
709
+ def num_timesteps(self):
710
+ return self._num_timesteps
711
+
712
+ @property
713
+ def current_timestep(self):
714
+ return self._current_timestep
715
+
716
+ @property
717
+ def interrupt(self):
718
+ return self._interrupt
719
+
720
+ @torch.no_grad()
721
+ @replace_example_docstring(EXAMPLE_DOC_STRING)
722
+ def __call__(
723
+ self,
724
+ prompt: Union[str, List[str]] = None,
725
+ negative_prompt: Union[str, List[str]] = None,
726
+ true_cfg_scale: float = 4.0,
727
+ height: Optional[int] = None,
728
+ width: Optional[int] = None,
729
+ num_inference_steps: int = 50,
730
+ sigmas: Optional[List[float]] = None,
731
+ guidance_scale: float = 1.0,
732
+ num_images_per_prompt: int = 1,
733
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
734
+ latents: Optional[torch.Tensor] = None,
735
+ prompt_embeds: Optional[torch.Tensor] = None,
736
+ prompt_embeds_mask: Optional[torch.Tensor] = None,
737
+ negative_prompt_embeds: Optional[torch.Tensor] = None,
738
+ negative_prompt_embeds_mask: Optional[torch.Tensor] = None,
739
+ output_type: Optional[str] = "pil",
740
+ return_dict: bool = True,
741
+ attention_kwargs: Optional[Dict[str, Any]] = None,
742
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
743
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
744
+ max_sequence_length: int = 512,
745
+ cond_image = None,
746
+ cond_bbox = None,
747
+ use_vlm = False,
748
+ tag_embedding = None,
749
+ ):
750
+ r"""
751
+ Function invoked when calling the pipeline for generation.
752
+
753
+ Args:
754
+ prompt (`str` or `List[str]`, *optional*):
755
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
756
+ instead.
757
+ negative_prompt (`str` or `List[str]`, *optional*):
758
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
759
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
760
+ not greater than `1`).
761
+ true_cfg_scale (`float`, *optional*, defaults to 4.0):
762
+ When greater than 1.0 and a `negative_prompt` is provided, true classifier-free guidance is enabled.
763
+ height (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
764
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
765
+ width (`int`, *optional*, defaults to `self.default_sample_size * self.vae_scale_factor`):
766
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
767
+ num_inference_steps (`int`, *optional*, defaults to 50):
768
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
769
+ expense of slower inference.
770
+ sigmas (`List[float]`, *optional*):
771
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
772
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
773
+ will be used.
774
+ guidance_scale (`float`, *optional*, defaults to 1.0):
775
+ Guidance scale as defined in [Classifier-Free Diffusion
776
+ Guidance](https://huggingface.co/papers/2207.12598). `guidance_scale` is defined as `w` in equation 2
777
+ of the [Imagen Paper](https://huggingface.co/papers/2205.11487). It is only used when the transformer
778
+ was trained with guidance embedding (`guidance_embeds=True`). A higher guidance scale encourages the
779
+ model to generate images closely linked to the text `prompt`, usually at the expense of image quality.
780
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
781
+ The number of images to generate per prompt.
782
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
783
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
784
+ to make generation deterministic.
785
+ latents (`torch.Tensor`, *optional*):
786
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
787
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
788
+ tensor will be generated by sampling using the supplied random `generator`.
789
+ prompt_embeds (`torch.Tensor`, *optional*):
790
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
791
+ provided, text embeddings will be generated from `prompt` input argument.
792
+ negative_prompt_embeds (`torch.Tensor`, *optional*):
793
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
794
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
795
+ argument.
796
+ output_type (`str`, *optional*, defaults to `"pil"`):
797
+ The output format of the generated image. Choose between
798
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
799
+ return_dict (`bool`, *optional*, defaults to `True`):
800
+ Whether or not to return a [`~pipelines.qwenimage.QwenImagePipelineOutput`] instead of a plain tuple.
801
+ attention_kwargs (`dict`, *optional*):
802
+ A kwargs dictionary that, if specified, is passed along to the `AttentionProcessor` as defined under
803
+ `self.processor` in
804
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
805
+ callback_on_step_end (`Callable`, *optional*):
806
+ A function that is called at the end of each denoising step during inference. The function is called
807
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
808
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
809
+ `callback_on_step_end_tensor_inputs`.
810
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
811
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
812
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
813
+ `._callback_tensor_inputs` attribute of your pipeline class.
814
+ max_sequence_length (`int`, *optional*, defaults to 512): Maximum sequence length to use with the `prompt`.
815
+
816
+ Examples:
817
+
818
+ Returns:
819
+ [`~pipelines.qwenimage.QwenImagePipelineOutput`] or `tuple`:
820
+ [`~pipelines.qwenimage.QwenImagePipelineOutput`] if `return_dict` is True, otherwise a `tuple`. When
821
+ returning a tuple, the first element is a list with the generated images.
822
+ """
823
+
824
+ height = height or self.default_sample_size * self.vae_scale_factor
825
+ width = width or self.default_sample_size * self.vae_scale_factor
826
+
827
+ # 1. Check inputs. Raise error if not correct
828
+ self.check_inputs(
829
+ prompt,
830
+ height,
831
+ width,
832
+ negative_prompt=negative_prompt,
833
+ prompt_embeds=prompt_embeds,
834
+ negative_prompt_embeds=negative_prompt_embeds,
835
+ prompt_embeds_mask=prompt_embeds_mask,
836
+ negative_prompt_embeds_mask=negative_prompt_embeds_mask,
837
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
838
+ max_sequence_length=max_sequence_length,
839
+ )
840
+
841
+ self._guidance_scale = guidance_scale
842
+ self._attention_kwargs = attention_kwargs
843
+ self._current_timestep = None
844
+ self._interrupt = False
845
+
846
+ # 2. Define call parameters
847
+ if prompt is not None and isinstance(prompt, str):
848
+ batch_size = 1
849
+ elif prompt is not None and isinstance(prompt, list):
850
+ batch_size = len(prompt)
851
+ else:
852
+ batch_size = prompt_embeds.shape[0]
853
+
854
+ device = self._get_module_input_device(self.transformer)
855
+ dtype = self.transformer.dtype
856
+
857
+ has_neg_prompt = negative_prompt is not None or (
858
+ negative_prompt_embeds is not None and negative_prompt_embeds_mask is not None
859
+ )
860
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
861
+ prompt_embeds, prompt_embeds_mask = self.encode_prompt(
862
+ prompt=prompt,
863
+ prompt_embeds=prompt_embeds,
864
+ prompt_embeds_mask=prompt_embeds_mask,
865
+ device=device,
866
+ num_images_per_prompt=num_images_per_prompt,
867
+ max_sequence_length=max_sequence_length,
868
+ image=cond_image if use_vlm else None,
869
+ )
870
+ if do_true_cfg:
871
+ negative_prompt_embeds, negative_prompt_embeds_mask = self.encode_prompt(
872
+ prompt=negative_prompt,
873
+ prompt_embeds=negative_prompt_embeds,
874
+ prompt_embeds_mask=negative_prompt_embeds_mask,
875
+ device=device,
876
+ num_images_per_prompt=num_images_per_prompt,
877
+ max_sequence_length=max_sequence_length,
878
+ image=cond_image if use_vlm else None,
879
+ )
880
+
881
+ # 4. Prepare latent variables
882
+ num_channels_latents = self.transformer.config.in_channels // 4
883
+ latents, latent_image_ids = self.prepare_latents(
884
+ batch_size * num_images_per_prompt,
885
+ num_channels_latents,
886
+ height,
887
+ width,
888
+ prompt_embeds.dtype,
889
+ device,
890
+ generator,
891
+ latents,
892
+ )
897
+ img_shapes = [(1, height // self.vae_scale_factor // 2, width // self.vae_scale_factor // 2)] * batch_size
898
+
899
+ # 4.1 cond_image
900
+ if cond_image is not None:
901
+ cond_image_latent = self.image_processor.preprocess(cond_image, height, width)
902
+ cond_image_latent = cond_image_latent.to(device, dtype=dtype)
903
+
904
+ cond_image_latent = self.vae.encode(cond_image_latent.to(dtype=self.vae.dtype)[:, :, None]).latent_dist.sample()
905
+ latents_mean = (
906
+ torch.tensor(self.vae.config.latents_mean)
907
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
908
+ .to(latents.device, latents.dtype)
909
+ )
910
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
911
+ latents.device, latents.dtype
912
+ )
913
+ cond_image_latent = (cond_image_latent - latents_mean) * latents_std
914
+ cond_image_latent = cond_image_latent.to(dtype=dtype)
915
+ height_cond_image_latent, width_cond_image_latent = cond_image_latent.shape[-2:]
916
+ if cond_bbox is None:
917
+ cond_image_latent = self._pack_latents(cond_image_latent, 1, 16, height_cond_image_latent, width_cond_image_latent)
918
+ else:
919
+ cond_image_latent = cond_image_latent.view(1, 16, height_cond_image_latent // 2, 2, width_cond_image_latent // 2, 2)
920
+ cond_image_latent = cond_image_latent.permute(0, 2, 4, 1, 3, 5)
921
+ x1, y1, x2, y2 = cond_bbox
922
+ cond_image_latent = cond_image_latent[:, y1:y2+1, x1:x2+1]
923
+ cond_image_latent = cond_image_latent.reshape(1, -1, 64)
924
+ else:
925
+ cond_image_latent = None
926
+
927
+ # 5. Prepare timesteps
928
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
929
+ image_seq_len = latents.shape[1]
930
+ mu = calculate_shift(
931
+ image_seq_len,
932
+ self.scheduler.config.get("base_image_seq_len", 256),
933
+ self.scheduler.config.get("max_image_seq_len", 4096),
934
+ self.scheduler.config.get("base_shift", 0.5),
935
+ self.scheduler.config.get("max_shift", 1.15),
936
+ )
937
+ timesteps, num_inference_steps = retrieve_timesteps(
938
+ self.scheduler,
939
+ num_inference_steps,
940
+ device,
941
+ sigmas=sigmas,
942
+ mu=mu,
943
+ )
944
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
945
+ self._num_timesteps = len(timesteps)
946
+
947
+ # handle guidance
948
+ if self.transformer.config.guidance_embeds:
949
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
950
+ guidance = guidance.expand(latents.shape[0])
951
+ else:
952
+ guidance = None
953
+
954
+ if self.attention_kwargs is None:
955
+ self._attention_kwargs = {}
956
+
957
+ # 6. Denoising loop
958
+ self.scheduler.set_begin_index(0)
959
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
960
+ for i, t in enumerate(timesteps):
961
+ if self.interrupt:
962
+ continue
963
+
964
+ self._current_timestep = t
965
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
966
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
967
+ condition_rotary_emb = None
968
+
969
+ if cond_image is not None:
970
+ if cond_bbox is None:
971
+ condition_rotary_emb = self.transformer.pos_embed.get_img_rope(
972
+ [(1, height_cond_image_latent // 2, width_cond_image_latent // 2)],
973
+ device=device,
974
+ frame_idx=1,
975
+ )
976
+
977
+ else:
978
+ condition_rotary_emb = self.transformer.pos_embed.get_img_rope_by_bbox([(1, height // 16, width // 16)], cond_bbox, device)
979
+
980
+ joint_attention_kwargs = dict()
981
+ else:
982
+ joint_attention_kwargs = dict()
983
+
984
+ with self.transformer.cache_context("cond"):
985
+ noise_pred = self.transformer(
986
+ hidden_states=latents,
987
+ timestep=timestep / 1000,
988
+ guidance=guidance,
989
+ encoder_hidden_states_mask=prompt_embeds_mask,
990
+ encoder_hidden_states=prompt_embeds,
991
+ img_shapes=img_shapes,
992
+ txt_seq_lens=prompt_embeds_mask.sum(dim=1).tolist(),
993
+ attention_kwargs=joint_attention_kwargs,
994
+ cond_hidden_states=cond_image_latent,
995
+ cond_rope=condition_rotary_emb,
996
+ ).sample
997
+
998
+ if do_true_cfg:
999
+ with self.transformer.cache_context("uncond"):
1000
+ neg_noise_pred = self.transformer(
1001
+ hidden_states=latents,
1002
+ timestep=timestep / 1000,
1003
+ guidance=guidance,
1004
+ encoder_hidden_states_mask=negative_prompt_embeds_mask,
1005
+ encoder_hidden_states=negative_prompt_embeds,
1006
+ img_shapes=img_shapes,
1007
+ txt_seq_lens=negative_prompt_embeds_mask.sum(dim=1).tolist(),
1008
+ attention_kwargs=joint_attention_kwargs,
1009
+ cond_hidden_states=cond_image_latent,
1010
+ cond_rope=condition_rotary_emb,
1011
+ ).sample
1012
+ comb_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
1013
+
1014
+ cond_norm = torch.norm(noise_pred, dim=-1, keepdim=True)
1015
+ noise_norm = torch.norm(comb_pred, dim=-1, keepdim=True)
1016
+ noise_pred = comb_pred * (cond_norm / noise_norm)
1017
+
1018
+ # compute the previous noisy sample x_t -> x_t-1
1019
+ latents_dtype = latents.dtype
1020
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
1021
+
1022
+ if latents.dtype != latents_dtype:
1023
+ if torch.backends.mps.is_available():
1024
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
1025
+ latents = latents.to(latents_dtype)
1026
+
1027
+ if callback_on_step_end is not None:
1028
+ callback_kwargs = {}
1029
+ for k in callback_on_step_end_tensor_inputs:
1030
+ callback_kwargs[k] = locals()[k]
1031
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
1032
+
1033
+ latents = callback_outputs.pop("latents", latents)
1034
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
1035
+
1036
+ # call the callback, if provided
1037
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
1038
+ progress_bar.update()
1039
+
1040
+ if XLA_AVAILABLE:
1041
+ xm.mark_step()
1042
+
1044
+ self._current_timestep = None
1045
+ if output_type == "latent":
1046
+ image = latents
1047
+ else:
1048
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
1049
+ latents = latents.to(self.vae.dtype)
1050
+ latents_mean = (
1051
+ torch.tensor(self.vae.config.latents_mean)
1052
+ .view(1, self.vae.config.z_dim, 1, 1, 1)
1053
+ .to(latents.device, latents.dtype)
1054
+ )
1055
+ latents_std = 1.0 / torch.tensor(self.vae.config.latents_std).view(1, self.vae.config.z_dim, 1, 1, 1).to(
1056
+ latents.device, latents.dtype
1057
+ )
1058
+ latents = latents / latents_std + latents_mean
1059
+ image = self.vae.decode(latents, return_dict=False)[0][:, :, 0]
1060
+ image = self.image_processor.postprocess(image, output_type=output_type)
1061
+
1062
+ # Offload all models
1063
+ self.maybe_free_model_hooks()
1064
+
1065
+ if not return_dict:
1066
+ return (image,)
1067
+
1068
+ return QwenImagePipelineOutput(images=image)
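
Beyond the upstream `QwenImagePipeline.__call__`, the new keyword arguments are: `cond_image` (a reference image whose VAE latents are appended as condition tokens), `cond_bbox` (an optional `(x1, y1, x2, y2)` crop in latent-patch coordinates that places the condition via its RoPE positions), `use_vlm` (encode the prompt together with `cond_image` through the VLM processor; requires a `vlm_processor`), and `tag_embedding` (accepted but not referenced in this function body). A call sketch, assuming a `pipe` built as in the earlier loading sketch and a hypothetical input file:

```py
from PIL import Image

ref = Image.open("reference.png").convert("RGB")  # hypothetical reference image
result = pipe(
    prompt="Repaint the scene as a watercolor illustration",
    negative_prompt=" ",
    true_cfg_scale=4.0,
    height=1024,
    width=1024,
    num_inference_steps=30,
    cond_image=ref,
)
result.images[0].save("tagmoe_output.png")
```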
src/utils/__init__.py ADDED
@@ -0,0 +1,31 @@
1
+ from src.utils.device_utils import (
2
+ build_accelerate_max_memory_map,
3
+ maybe_set_cuda_device_from_tensor,
4
+ parse_device_ids,
5
+ resolve_device_ids,
6
+ )
7
+ from src.utils.inference_config import (
8
+ DEFAULT_HEIGHT,
9
+ DEFAULT_NEGATIVE_PROMPT,
10
+ DEFAULT_NUM_INFERENCE_STEPS,
11
+ DEFAULT_SEED,
12
+ DEFAULT_TRUE_CFG_SCALE,
13
+ DEFAULT_WIDTH,
14
+ generate_random_seed,
15
+ normalize_negative_prompt,
16
+ )
17
+
18
+ __all__ = [
19
+ "DEFAULT_HEIGHT",
20
+ "DEFAULT_NEGATIVE_PROMPT",
21
+ "DEFAULT_NUM_INFERENCE_STEPS",
22
+ "DEFAULT_SEED",
23
+ "DEFAULT_TRUE_CFG_SCALE",
24
+ "DEFAULT_WIDTH",
25
+ "build_accelerate_max_memory_map",
26
+ "generate_random_seed",
27
+ "maybe_set_cuda_device_from_tensor",
28
+ "normalize_negative_prompt",
29
+ "parse_device_ids",
30
+ "resolve_device_ids",
31
+ ]
src/utils/device_utils.py ADDED
@@ -0,0 +1,80 @@
1
+ from __future__ import annotations
2
+
3
+ from typing import Dict, Iterable, List, Mapping
4
+
5
+ import torch
6
+
7
+
8
+ def resolve_device_ids(device_arg: str | None) -> list[int] | None:
9
+ """Validate a user-provided device spec and return a list of GPU ids.
10
+
11
+ Returns ``None`` when *device_arg* is ``None`` (meaning "use framework
12
+ default"), an empty list for CPU, or a list of validated GPU indices.
13
+ """
14
+ if device_arg is None:
15
+ return None
16
+
17
+ device_ids = parse_device_ids(device_arg)
18
+
19
+ if len(device_ids) > 0 and not torch.cuda.is_available():
20
+ raise ValueError("CUDA is not available, but GPU device ids were provided.")
21
+ if len(device_ids) == 0:
22
+ return []
23
+
24
+ device_count = torch.cuda.device_count()
27
+ invalid_ids = [idx for idx in device_ids if idx < 0 or idx >= device_count]
28
+ if invalid_ids:
29
+ raise ValueError(
30
+ f"Invalid GPU ids {invalid_ids}. Available GPU ids: 0..{device_count - 1}."
31
+ )
32
+ return device_ids
33
+
34
+
35
+ def parse_device_ids(device_arg: str) -> List[int]:
36
+ value = device_arg.strip().lower()
37
+ if not value:
38
+ raise ValueError("Device argument is empty.")
39
+ if value in {"cpu", "-1"}:
40
+ return []
41
+
42
+ device_ids = []
43
+ for part in value.split(","):
44
+ token = part.strip()
45
+ if not token:
46
+ raise ValueError(f"Invalid device list: {device_arg!r}")
47
+ device_ids.append(int(token))
48
+ return device_ids
49
+
50
+
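
Expected parsing semantics: `cpu` and `-1` map to an empty id list, while `None` defers to the framework default:

```py
assert parse_device_ids("0,1") == [0, 1]
assert parse_device_ids("cpu") == []
assert parse_device_ids("-1") == []
assert resolve_device_ids(None) is None
```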
51
+ def build_accelerate_max_memory_map(
52
+ device_ids: Iterable[int],
53
+ free_bytes_by_device: Mapping[int, int],
54
+ reserve_bytes: int = 2 * 1024**3,
55
+ ) -> Dict[int, str]:
56
+ max_memory: Dict[int, str] = {}
57
+ for device_id in device_ids:
58
+ if device_id not in free_bytes_by_device:
59
+ raise ValueError(f"Missing free memory info for device {device_id}.")
60
+ free_bytes = free_bytes_by_device[device_id]
61
+ usable_gib = max(int((free_bytes - reserve_bytes) / (1024**3)), 4)
62
+ max_memory[device_id] = f"{usable_gib}GiB"
63
+ return max_memory
64
+
65
+
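
`build_accelerate_max_memory_map` converts per-GPU free byte counts into the `max_memory` strings accelerate understands, holding back a reserve (2 GiB by default) and flooring at 4 GiB. For example:

```py
free = {0: 80 * 1024**3, 1: 24 * 1024**3}
assert build_accelerate_max_memory_map([0, 1], free) == {0: "78GiB", 1: "22GiB"}
```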
66
+ def maybe_set_cuda_device_from_tensor(tensor) -> None:
67
+ if tensor is None:
68
+ return
69
+ if not torch.cuda.is_available():
70
+ return
71
+ if not getattr(tensor, "is_cuda", False):
72
+ return
73
+
74
+ device = getattr(tensor, "device", None)
75
+ device_index = getattr(device, "index", None)
76
+ if device_index is None:
77
+ return
78
+ if torch.cuda.current_device() == device_index:
79
+ return
80
+ torch.cuda.set_device(device_index)
src/utils/inference_config.py ADDED
@@ -0,0 +1,19 @@
1
+ import random
2
+
3
+
4
+ DEFAULT_WIDTH = 1024
5
+ DEFAULT_HEIGHT = 1024
6
+ DEFAULT_SEED = -1
7
+ DEFAULT_TRUE_CFG_SCALE = 4.0
8
+ DEFAULT_NUM_INFERENCE_STEPS = 30
9
+ DEFAULT_NEGATIVE_PROMPT = ""
10
+
11
+
12
+ def normalize_negative_prompt(value: str | None) -> str:
13
+ if value is None or not str(value).strip():
14
+ return " "
15
+ return str(value)
16
+
17
+
18
+ def generate_random_seed() -> int:
19
+ return random.randint(0, 2**32 - 1)
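
Behavior notes for the helpers above: an empty or whitespace-only negative prompt is normalized to a single space so the text encoder still receives a non-empty string, and seeds are drawn from the full unsigned 32-bit range:

```py
assert normalize_negative_prompt(None) == " "
assert normalize_negative_prompt("   ") == " "
assert normalize_negative_prompt("blurry, low quality") == "blurry, low quality"

seed = generate_random_seed()
assert 0 <= seed <= 2**32 - 1
```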