tri86uit committed
Commit 8d4cbac · verified · 1 Parent(s): b5ae46d

Upload 15 files

.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+imgs/test_cases/rc_car/02.jpg filter=lfs diff=lfs merge=lfs -text
+imgs/test_cases/rc_car/03.jpg filter=lfs diff=lfs merge=lfs -text
+imgs/test_cases/rc_car/04.jpg filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -1,12 +1,20 @@
 ---
-title: SynCD Base FLUX
-emoji: 🌖
-colorFrom: red
-colorTo: indigo
+title: SynCD
+emoji: 🖼
+colorFrom: purple
+colorTo: red
 sdk: gradio
-sdk_version: 6.3.0
+sdk_version: 5.17.1
 app_file: app.py
 pinned: false
+license: mit
+tags:
+- dwpose
+- pose
+- Text-to-Image
+- Image-to-Image
+- language models
+- LLMs
+short_description: Image generator/customization/personalization
 ---
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
app.py ADDED
@@ -0,0 +1,287 @@
1
+ import os
2
+ import random
3
+
4
+ import gradio as gr
5
+ import numpy as np
6
+ import spaces
7
+ import torch
8
+ from einops import rearrange
9
+ from huggingface_hub import login
10
+ from peft import LoraConfig
11
+ from PIL import Image
12
+ from pipelines.flux_pipeline.pipeline import SynCDFluxPipeline
13
+ from pipelines.flux_pipeline.transformer import FluxTransformer2DModelWithMasking
14
+
15
+ HF_TOKEN = os.getenv('HF_TOKEN')
16
+ login(token=HF_TOKEN)
17
+ torch_dtype = torch.bfloat16
18
+ transformer = FluxTransformer2DModelWithMasking.from_pretrained(
19
+ 'black-forest-labs/FLUX.1-dev',
20
+ subfolder='transformer',
21
+ torch_dtype=torch_dtype
22
+ )
23
+ pipeline = SynCDFluxPipeline.from_pretrained('black-forest-labs/FLUX.1-dev', transformer=transformer, torch_dtype=torch_dtype)
24
+ for name, attn_proc in pipeline.transformer.attn_processors.items():
25
+ attn_proc.name = name
26
+
27
+ target_modules=[
28
+ "to_k",
29
+ "to_q",
30
+ "to_v",
31
+ "add_k_proj",
32
+ "add_q_proj",
33
+ "add_v_proj",
34
+ "to_out.0",
35
+ "to_add_out",
36
+ "ff.net.0.proj",
37
+ "ff.net.2",
38
+ "ff_context.net.0.proj",
39
+ "ff_context.net.2",
40
+ "proj_mlp",
41
+ "proj_out",
42
+ ]
43
+ lora_rank = 32
44
+ lora_config = LoraConfig(
45
+ r=lora_rank,
46
+ lora_alpha=lora_rank,
47
+ init_lora_weights="gaussian",
48
+ target_modules=target_modules,
49
+ )
50
+ pipeline.transformer.add_adapter(lora_config)
51
+ finetuned_path = torch.load('models/pytorch_model.bin', map_location='cpu')
52
+ transformer_dict = {}
53
+ for key,value in finetuned_path.items():
54
+ if 'transformer.base_model.model.' in key:
55
+ transformer_dict[key.replace('transformer.base_model.model.', '')] = value
56
+ pipeline.transformer.load_state_dict(transformer_dict, strict=False)
57
+ pipeline.to('cuda')
58
+ pipeline.enable_vae_slicing()
59
+ pipeline.enable_vae_tiling()
60
+
61
+ @torch.no_grad()
62
+ def decode(latents, pipeline):
63
+ latents = latents / pipeline.vae.config.scaling_factor
64
+ image = pipeline.vae.decode(latents, return_dict=False)[0]
65
+ return image
66
+
67
+
68
+ @torch.no_grad()
69
+ def encode_target_images(images, pipeline):
70
+ latents = pipeline.vae.encode(images).latent_dist.sample()
71
+ latents = latents * pipeline.vae.config.scaling_factor
72
+ return latents
73
+
74
+
75
+ @spaces.GPU(duration=120)
76
+ def generate_image(text, img1, img2, img3, guidance_scale, inference_steps, seed, enable_cpu_offload=False, neg_prompt="", true_cfg=1.0, image_cfg=0.0):
77
+ if neg_prompt == "":
78
+ neg_prompt = "3d render, cartoon, low resolution, illustration, blurry, unrealistic"
79
+ if enable_cpu_offload:
80
+ pipeline.enable_sequential_cpu_offload()
81
+ input_images = [img1, img2, img3]
82
+ # Delete None
83
+ input_images = [img for img in input_images if img is not None]
84
+ if len(input_images) == 0:
85
+ return "Please upload at least one image"
86
+ numref = len(input_images) + 1
87
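+ # Resize each reference image to 512x512 and normalize to [-1, 1] in NCHW layout for the VAE encoder.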
+ images = torch.cat([2. * torch.from_numpy(np.array(Image.open(img).convert('RGB').resize((512, 512)))).permute(2, 0, 1).unsqueeze(0).to(torch_dtype)/255. -1. for img in input_images])
88
+ images = images.to(pipeline.device)
89
+ latents = encode_target_images(images, pipeline)
90
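+ # Slot 0 holds the view to be generated (zeroed latents, mask = 1); the remaining slots hold the encoded reference images (mask = 0), which the pipeline re-injects at every denoising step.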
+ latents = torch.cat([torch.zeros_like(latents[:1]), latents], dim=0)
91
+ masklatent = torch.zeros_like(latents)
92
+ masklatent[:1] = 1.
93
+ latents = rearrange(latents, "(b n) c h w -> b c h (n w)", n=numref)
94
+ masklatent = rearrange(masklatent, "(b n) c h w -> b c h (n w)", n=numref)
95
+ B, C, H, W = latents.shape
96
+ latents = pipeline._pack_latents(latents, B, C, H, W)
97
+ masklatent = pipeline._pack_latents(masklatent.expand(-1, C, -1, -1) ,B, C, H, W)
98
+ output = pipeline(
99
+ text,
100
+ latents_ref=latents,
101
+ latents_mask=masklatent,
102
+ guidance_scale=guidance_scale,
103
+ num_inference_steps=inference_steps,
104
+ height=512,
105
+ width=numref * 512,
106
+ generator = torch.Generator(device="cuda").manual_seed(seed),
107
+ joint_attention_kwargs={'shared_attn': True, 'num': numref},
108
+ return_dict=False,
109
+ negative_prompt=neg_prompt,
110
+ true_cfg_scale=true_cfg,
111
+ image_cfg_scale=image_cfg,
112
+ )[0][0]
113
+ output = rearrange(output, "b c h (n w) -> (b n) c h w", n=numref)[::numref]
114
+ img = Image.fromarray( (( torch.clip(output[0].float(), -1., 1.).permute(1,2,0).cpu().numpy()*0.5+0.5)*255).astype(np.uint8) )
115
+ return img
116
+
117
+
118
+
119
+ def get_example():
120
+ case = [
121
+ [
122
+ "An action figure on top of a mountain. Sunset in the background. Realistic shot.",
123
+ "./imgs/test_cases/action_figure/0.jpg",
124
+ "./imgs/test_cases/action_figure/1.jpg",
125
+ "./imgs/test_cases/action_figure/2.jpg",
126
+ 3.5,
127
+ 42,
128
+ False,
129
+ "",
130
+ 1.0,
131
+ 0.0,
132
+ ],
133
+ [
134
+ "A penguin plushie wearing pink sunglasses is lounging on a beach. Realistic shot.",
135
+ "./imgs/test_cases/penguin/0.jpg",
136
+ "./imgs/test_cases/penguin/1.jpg",
137
+ "./imgs/test_cases/penguin/2.jpg",
138
+ 3.5,
139
+ 42,
140
+ False,
141
+ "",
142
+ 1.0,
143
+ 0.0,
144
+ ],
145
+ [
146
+ "A toy on a beach. Waves in the background. Realistic shot.",
147
+ "./imgs/test_cases/rc_car/02.jpg",
148
+ "./imgs/test_cases/rc_car/03.jpg",
149
+ "./imgs/test_cases/rc_car/04.jpg",
150
+ 3.5,
151
+ 42,
152
+ False,
153
+ "",
154
+ 1.0,
155
+ 0.0,
156
+ ],
157
+ ]
158
+ return case
159
+
160
+ def run_for_examples(text, img1, img2, img3, guidance_scale, seed, enable_cpu_offload=False, neg_prompt="", true_cfg=1.0, image_cfg=0.0):
161
+ inference_steps = 30
162
+
163
+ return generate_image(
164
+ text, img1, img2, img3, guidance_scale, inference_steps, seed, enable_cpu_offload, neg_prompt, true_cfg, image_cfg
165
+ )
166
+
167
+ description = """
168
+ The Synthetic Customization Dataset (SynCD) consists of multiple images of the same object in different contexts. We achieve this by promoting a consistent object identity during generation, using either explicit 3D object assets or, more implicitly, masked shared attention across different views. Given this training data, we train a new encoder-based customization model that can generate novel compositions of a reference object from text prompts. You can download our dataset [here](https://huggingface.co/datasets/nupurkmr9/syncd).
169
+
170
+ Our model supports multiple input images of the same object as references. You can upload up to 3 images; results are typically better with 3 reference images than with 1.
171
+
172
+ **HF Spaces often encounter errors due to quota limitations, so we recommend running it locally.**
173
+ """
174
+
175
+ article = """
176
+ ---
177
+ **Citation**
178
+ <br>
179
+ If you find this repository useful, please consider giving a star ⭐ and a citation
180
+ ```
181
+ @article{kumari2025syncd,
182
+ title={Generating Multi-Image Synthetic Data for Text-to-Image Customization},
183
+ author={Kumari, Nupur and Yin, Xi and Zhu, Jun-Yan and Misra, Ishan and Azadi, Samaneh},
184
+ journal={ArXiv},
185
+ year={2025}
186
+ }
187
+ ```
188
+ **Contact**
189
+ <br>
190
+ If you have any questions, please feel free to open an issue or reach out to us directly via email.
191
+
192
+ **Acknowledgement**
193
+ <br>
194
+ This Space was adapted from the [OmniGen](https://huggingface.co/spaces/Shitao/OmniGen) Space.
195
+ """
196
+
197
+
198
+ # Gradio
199
+ with gr.Blocks() as demo:
200
+ gr.Markdown("# SynCD: Generating Multi-Image Synthetic Data for Text-to-Image Customization [[paper](https://arxiv.org/abs/2502.01720)] [[code](https://github.com/nupurkmr9/syncd)]")
201
+ gr.Markdown(description)
202
+ with gr.Row():
203
+ with gr.Column():
204
+ # text prompt
205
+ prompt_input = gr.Textbox(
206
+ label="Enter your prompt, more descriptive prompt will lead to better results", placeholder="Type your prompt here..."
207
+ )
208
+
209
+ with gr.Row(equal_height=True):
210
+ # input images
211
+ image_input_1 = gr.Image(label="img1", type="filepath")
212
+ image_input_2 = gr.Image(label="img2", type="filepath")
213
+ image_input_3 = gr.Image(label="img3", type="filepath")
214
+
215
+ guidance_scale_input = gr.Slider(
216
+ label="Guidance Scale", minimum=1.0, maximum=5.0, value=3.5, step=0.1
217
+ )
218
+
219
+ num_inference_steps = gr.Slider(
220
+ label="Inference Steps", minimum=1, maximum=100, value=30, step=1
221
+ )
222
+
223
+ seed_input = gr.Slider(
224
+ label="Seed", minimum=0, maximum=2147483647, value=42, step=1
225
+ )
226
+
227
+ enable_cpu_offload = gr.Checkbox(
228
+ label="Enable CPU Offload", info="Enable CPU Offload to avoid memory issues", value=False,
229
+ )
230
+
231
+ with gr.Accordion("Advanced Options (True CFG, true_cfg_scale=1 means use fake CFG, >1 means use true CFG", open=False): # noqa E501
232
+ neg_prompt = gr.Textbox(
233
+ label="Negative Prompt",
234
+ value="")
235
+ true_cfg = gr.Slider(1.0, 10.0, 1.5, step=0.1, label="True CFG scale (recommended: 1.5)")
236
+ image_cfg = gr.Slider(0.0, 10.0, 0.0, step=0.1, label="Image CFG scale: improves image alignment at the cost of longer runtime and lower text alignment (recommended: 1.0)")
237
+
238
+ # generate
239
+ generate_button = gr.Button("Generate Image")
240
+
241
+
242
+ with gr.Column():
243
+ # output image
244
+ output_image = gr.Image(label="Output Image")
245
+
246
+ # click
247
+ generate_button.click(
248
+ generate_image,
249
+ inputs=[
250
+ prompt_input,
251
+ image_input_1,
252
+ image_input_2,
253
+ image_input_3,
254
+ guidance_scale_input,
255
+ num_inference_steps,
256
+ seed_input,
257
+ enable_cpu_offload,
258
+ neg_prompt,
259
+ true_cfg,
260
+ image_cfg,
261
+ ],
262
+ outputs=output_image,
263
+ )
264
+
265
+ gr.Examples(
266
+ examples=get_example(),
267
+ fn=run_for_examples,
268
+ inputs=[
269
+ prompt_input,
270
+ image_input_1,
271
+ image_input_2,
272
+ image_input_3,
273
+ guidance_scale_input,
274
+ seed_input,
275
+ enable_cpu_offload,
276
+ neg_prompt,
277
+ true_cfg,
278
+ image_cfg,
279
+ ],
280
+ outputs=output_image,
281
+ )
282
+
283
+ gr.Markdown(article)
284
+
285
+ # launch
286
+ demo.launch(ssr_mode=False)
287
+
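Since the note above recommends running the demo locally, here is a minimal, hedged sketch of calling `generate_image` directly with the rc_car example assets shipped in this commit. It assumes the model-loading code at the top of `app.py` has already been executed in the same session (for example in a notebook, or in a copy of the script before `demo.launch()`); the output filename is illustrative only.

```python
# Hypothetical direct call to generate_image(), reusing the bundled rc_car example images;
# assumes the pipeline/LoRA setup from the top of app.py has already run in this session.
img = generate_image(
    text="A toy on a beach. Waves in the background. Realistic shot.",
    img1="./imgs/test_cases/rc_car/02.jpg",
    img2="./imgs/test_cases/rc_car/03.jpg",
    img3="./imgs/test_cases/rc_car/04.jpg",
    guidance_scale=3.5,
    inference_steps=30,
    seed=42,
)
img.save("rc_car_on_beach.jpg")  # generate_image returns a PIL.Image
```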
imgs/test_cases/action_figure/0.jpg ADDED
imgs/test_cases/action_figure/1.jpg ADDED
imgs/test_cases/action_figure/2.jpg ADDED
imgs/test_cases/penguin/0.jpg ADDED
imgs/test_cases/penguin/1.jpg ADDED
imgs/test_cases/penguin/2.jpg ADDED
imgs/test_cases/rc_car/02.jpg ADDED

Git LFS Details

  • SHA256: 5306db6234529b9d6d41357724e901791e8f20024d9acee1582b250c599e035b
  • Pointer size: 131 Bytes
  • Size of remote file: 238 kB
imgs/test_cases/rc_car/03.jpg ADDED

Git LFS Details

  • SHA256: ed3f563a4ec793e34be5ce3810d8c8da3e42c5d4f2fa3e3f4159f6b05ab4307a
  • Pointer size: 131 Bytes
  • Size of remote file: 312 kB
imgs/test_cases/rc_car/04.jpg ADDED

Git LFS Details

  • SHA256: 46376e08104f686383f04c15e2a5d8341c02afefd2e544bd043d13d7cbc26bc8
  • Pointer size: 131 Bytes
  • Size of remote file: 407 kB
models/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0de7be527b2bf604f679a8c4a0545af4a371e6559aff8bfa28f2a47510872da9
3
+ size 134
pipelines/flux_pipeline/pipeline.py ADDED
@@ -0,0 +1,470 @@
1
+ # Copyright 2024 Black Forest Labs and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import inspect
16
+ from typing import Any, Callable, Dict, List, Optional, Union
17
+
18
+ import numpy as np
19
+ import torch
20
+ from diffusers import FluxPipeline
21
+ from diffusers.models.autoencoders import AutoencoderKL
22
+ from diffusers.models.transformers import FluxTransformer2DModel
23
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
24
+ from diffusers.utils import is_torch_xla_available
25
+ from transformers import (
26
+ CLIPImageProcessor,
27
+ CLIPTextModel,
28
+ CLIPTokenizer,
29
+ CLIPVisionModelWithProjection,
30
+ T5EncoderModel,
31
+ T5TokenizerFast,
32
+ )
33
+
34
+ if is_torch_xla_available():
35
+ import torch_xla.core.xla_model as xm
36
+
37
+ XLA_AVAILABLE = True
38
+ else:
39
+ XLA_AVAILABLE = False
40
+
41
+
42
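+ # Linearly interpolate the flow-matching timestep shift (mu) between base_shift and max_shift as a function of the image token sequence length.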
+ def calculate_shift(
43
+ image_seq_len,
44
+ base_seq_len: int = 256,
45
+ max_seq_len: int = 4096,
46
+ base_shift: float = 0.5,
47
+ max_shift: float = 1.16,
48
+ ):
49
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
50
+ b = base_shift - m * base_seq_len
51
+ mu = image_seq_len * m + b
52
+ return mu
53
+
54
+
55
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
56
+ def retrieve_timesteps(
57
+ scheduler,
58
+ num_inference_steps: Optional[int] = None,
59
+ device: Optional[Union[str, torch.device]] = None,
60
+ timesteps: Optional[List[int]] = None,
61
+ sigmas: Optional[List[float]] = None,
62
+ **kwargs,):
63
+ if timesteps is not None and sigmas is not None:
64
+ raise ValueError(
65
+ "Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values"
66
+ )
67
+ if timesteps is not None:
68
+ accepts_timesteps = "timesteps" in set(
69
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
70
+ )
71
+ if not accepts_timesteps:
72
+ raise ValueError(
73
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
74
+ f" timestep schedules. Please check whether you are using the correct scheduler."
75
+ )
76
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
77
+ timesteps = scheduler.timesteps
78
+ num_inference_steps = len(timesteps)
79
+ elif sigmas is not None:
80
+ accept_sigmas = "sigmas" in set(
81
+ inspect.signature(scheduler.set_timesteps).parameters.keys()
82
+ )
83
+ if not accept_sigmas:
84
+ raise ValueError(
85
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
86
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
87
+ )
88
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
89
+ timesteps = scheduler.timesteps
90
+ num_inference_steps = len(timesteps)
91
+ else:
92
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
93
+ timesteps = scheduler.timesteps
94
+ return timesteps, num_inference_steps
95
+
96
+
97
+ def normalized_guidance_image(neg_noise_pred, noise_pred, image_noise_pred, true_cfg_scale, image_cfg_scale):
98
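+ # Rescale the image and text guidance directions so neither exceeds the smaller of their two norms, then add both to the image-conditioned prediction.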
+ diff_img = image_noise_pred - neg_noise_pred
99
+ diff_txt = noise_pred - image_noise_pred
100
+
101
+ diff_norm_txt = diff_txt.norm(p=2, dim=[-1, -2], keepdim=True)
102
+ diff_norm_img = diff_img.norm(p=2, dim=[-1, -2], keepdim=True)
103
+ min_norm = torch.minimum(diff_norm_img, diff_norm_txt)
104
+ diff_txt = diff_txt * torch.minimum(torch.ones_like(diff_txt), min_norm / diff_norm_txt)
105
+ diff_img = diff_img * torch.minimum(torch.ones_like(diff_txt), min_norm / diff_norm_img)
106
+ pred_guided = image_noise_pred + image_cfg_scale * diff_img + true_cfg_scale * diff_txt
107
+ return pred_guided
108
+
109
+
110
+ class SynCDFluxPipeline(FluxPipeline):
111
+
112
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
113
+ _optional_components = []
114
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
115
+
116
+ def __init__(
117
+ self,
118
+ scheduler: FlowMatchEulerDiscreteScheduler,
119
+ vae: AutoencoderKL,
120
+ text_encoder: CLIPTextModel,
121
+ tokenizer: CLIPTokenizer,
122
+ text_encoder_2: T5EncoderModel,
123
+ tokenizer_2: T5TokenizerFast,
124
+ transformer: FluxTransformer2DModel,
125
+ image_encoder: CLIPVisionModelWithProjection = None,
126
+ feature_extractor: CLIPImageProcessor = None,
127
+ ###
128
+ num=2,
129
+ ):
130
+ super().__init__(
131
+ vae=vae,
132
+ text_encoder=text_encoder,
133
+ text_encoder_2=text_encoder_2,
134
+ tokenizer=tokenizer,
135
+ tokenizer_2=tokenizer_2,
136
+ transformer=transformer,
137
+ scheduler=scheduler,
138
+ image_encoder=image_encoder,
139
+ feature_extractor=feature_extractor
140
+ )
141
+ self.default_sample_size = 64
142
+ self.num = num
143
+
144
+ @torch.no_grad()
145
+ def __call__(
146
+ self,
147
+ prompt: Union[str, List[str]] = None,
148
+ prompt_2: Optional[Union[str, List[str]]] = None,
149
+ negative_prompt: Union[str, List[str]] = None,
150
+ negative_prompt_2: Optional[Union[str, List[str]]] = None,
151
+ true_cfg_scale: float = 1.0,
152
+ height: Optional[int] = None,
153
+ width: Optional[int] = None,
154
+ num_inference_steps: int = 28,
155
+ sigmas: Optional[List[float]] = None,
156
+ guidance_scale: float = 3.5,
157
+ num_images_per_prompt: Optional[int] = 1,
158
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
159
+ latents: Optional[torch.FloatTensor] = None,
160
+ prompt_embeds: Optional[torch.FloatTensor] = None,
161
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
162
+ negative_prompt_embeds: Optional[torch.FloatTensor] = None,
163
+ negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
164
+ output_type: Optional[str] = "pil",
165
+ return_dict: bool = True,
166
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
167
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
168
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
169
+ max_sequence_length: int = 512,
170
+ #####
171
+ latents_ref: Optional[torch.Tensor] = None,
172
+ latents_mask: Optional[torch.Tensor] = None,
173
+ return_latents: bool = False,
174
+ image_cfg_scale: float = 0.0,
175
+ ):
176
+ r"""
177
+ Function invoked when calling the pipeline for generation.
178
+
179
+ Args:
180
+ prompt (`str` or `List[str]`, *optional*):
181
+ The prompt or prompts to guide the image generation. If not defined, one has to pass `prompt_embeds`
182
+ instead.
183
+ prompt_2 (`str` or `List[str]`, *optional*):
184
+ The prompt or prompts to be sent to `tokenizer_2` and `text_encoder_2`. If not defined, `prompt`
185
+ will be used instead.
186
+ negative_prompt (`str` or `List[str]`, *optional*):
187
+ The prompt or prompts not to guide the image generation. If not defined, one has to pass
188
+ `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `true_cfg_scale` is
189
+ not greater than `1`).
190
+ negative_prompt_2 (`str` or `List[str]`, *optional*):
191
+ The prompt or prompts not to guide the image generation to be sent to `tokenizer_2` and
192
+ `text_encoder_2`. If not defined, `negative_prompt` is used in all the text-encoders.
193
+ true_cfg_scale (`float`, *optional*, defaults to 1.0):
194
+ When > 1.0 and a provided `negative_prompt`, enables true classifier-free guidance.
195
+ height (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
196
+ The height in pixels of the generated image. This is set to 1024 by default for the best results.
197
+ width (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
198
+ The width in pixels of the generated image. This is set to 1024 by default for the best results.
199
+ num_inference_steps (`int`, *optional*, defaults to 50):
200
+ The number of denoising steps. More denoising steps usually lead to a higher quality image at the
201
+ expense of slower inference.
202
+ sigmas (`List[float]`, *optional*):
203
+ Custom sigmas to use for the denoising process with schedulers which support a `sigmas` argument in
204
+ their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is passed
205
+ will be used.
206
+ guidance_scale (`float`, *optional*, defaults to 3.5):
207
+ Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
208
+ `guidance_scale` is defined as `w` of equation 2. of [Imagen
209
+ Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
210
+ 1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
211
+ usually at the expense of lower image quality.
212
+ num_images_per_prompt (`int`, *optional*, defaults to 1):
213
+ The number of images to generate per prompt.
214
+ generator (`torch.Generator` or `List[torch.Generator]`, *optional*):
215
+ One or a list of [torch generator(s)](https://pytorch.org/docs/stable/generated/torch.Generator.html)
216
+ to make generation deterministic.
217
+ latents (`torch.FloatTensor`, *optional*):
218
+ Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
219
+ generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
220
+ tensor will be generated by sampling using the supplied random `generator`.
221
+ prompt_embeds (`torch.FloatTensor`, *optional*):
222
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
223
+ provided, text embeddings will be generated from `prompt` input argument.
224
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
225
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
226
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
227
+ negative_prompt_embeds (`torch.FloatTensor`, *optional*):
228
+ Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
229
+ weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input
230
+ argument.
231
+ negative_pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
232
+ Pre-generated negative pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt
233
+ weighting. If not provided, pooled negative_prompt_embeds will be generated from `negative_prompt`
234
+ input argument.
235
+ output_type (`str`, *optional*, defaults to `"pil"`):
236
+ The output format of the generated image. Choose between
237
+ [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
238
+ return_dict (`bool`, *optional*, defaults to `True`):
239
+ Whether or not to return a [`~pipelines.flux.FluxPipelineOutput`] instead of a plain tuple.
240
+ joint_attention_kwargs (`dict`, *optional*):
241
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
242
+ `self.processor` in
243
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
244
+ callback_on_step_end (`Callable`, *optional*):
245
+ A function that calls at the end of each denoising steps during the inference. The function is called
246
+ with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
247
+ callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
248
+ `callback_on_step_end_tensor_inputs`.
249
+ callback_on_step_end_tensor_inputs (`List`, *optional*):
250
+ The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
251
+ will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
252
+ `._callback_tensor_inputs` attribute of your pipeline class.
253
+ max_sequence_length (`int` defaults to 512): Maximum sequence length to use with the `prompt`.
254
+
255
+ Examples:
256
+
257
+ Returns:
258
+ [`~pipelines.flux.FluxPipelineOutput`] or `tuple`: [`~pipelines.flux.FluxPipelineOutput`] if `return_dict`
259
+ is True, otherwise a `tuple`. When returning a tuple, the first element is a list with the generated
260
+ images.
261
+ """
262
+
263
+ height = height or self.default_sample_size * self.vae_scale_factor
264
+ width = width or self.default_sample_size * self.vae_scale_factor
265
+
266
+ # 1. Check inputs. Raise error if not correct
267
+ self.check_inputs(
268
+ prompt,
269
+ prompt_2,
270
+ height,
271
+ width,
272
+ negative_prompt=negative_prompt,
273
+ negative_prompt_2=negative_prompt_2,
274
+ prompt_embeds=prompt_embeds,
275
+ negative_prompt_embeds=negative_prompt_embeds,
276
+ pooled_prompt_embeds=pooled_prompt_embeds,
277
+ negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
278
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
279
+ max_sequence_length=max_sequence_length,
280
+ )
281
+
282
+ self._guidance_scale = guidance_scale
283
+ self._joint_attention_kwargs = joint_attention_kwargs
284
+ self._current_timestep = None
285
+ self._interrupt = False
286
+
287
+ # 2. Define call parameters
288
+ if prompt is not None and isinstance(prompt, str):
289
+ batch_size = 1
290
+ elif prompt is not None and isinstance(prompt, list):
291
+ batch_size = len(prompt)
292
+ else:
293
+ batch_size = prompt_embeds.shape[0]
294
+
295
+ device = self._execution_device
296
+
297
+ lora_scale = (
298
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
299
+ )
300
+ has_neg_prompt = negative_prompt is not None or (
301
+ negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
302
+ )
303
+ do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
304
+ (
305
+ prompt_embeds,
306
+ pooled_prompt_embeds,
307
+ text_ids,
308
+ ) = self.encode_prompt(
309
+ prompt=prompt,
310
+ prompt_2=prompt_2,
311
+ prompt_embeds=prompt_embeds,
312
+ pooled_prompt_embeds=pooled_prompt_embeds,
313
+ device=device,
314
+ num_images_per_prompt=num_images_per_prompt,
315
+ max_sequence_length=max_sequence_length,
316
+ lora_scale=lora_scale,
317
+ )
318
+ if do_true_cfg:
319
+ (
320
+ negative_prompt_embeds,
321
+ negative_pooled_prompt_embeds,
322
+ _,
323
+ ) = self.encode_prompt(
324
+ prompt=negative_prompt,
325
+ prompt_2=negative_prompt_2,
326
+ prompt_embeds=negative_prompt_embeds,
327
+ pooled_prompt_embeds=negative_pooled_prompt_embeds,
328
+ device=device,
329
+ num_images_per_prompt=num_images_per_prompt,
330
+ max_sequence_length=max_sequence_length,
331
+ lora_scale=lora_scale,
332
+ )
333
+
334
+ # 4. Prepare latent variables
335
+ num_channels_latents = self.transformer.config.in_channels // 4
336
+ latents, latent_image_ids = self.prepare_latents(
337
+ batch_size * num_images_per_prompt,
338
+ num_channels_latents,
339
+ height,
340
+ width,
341
+ prompt_embeds.dtype,
342
+ device,
343
+ generator,
344
+ latents,
345
+ )
346
+
347
+ # 5. Prepare timesteps
348
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
349
+ image_seq_len = latents.shape[1]
350
+ mu = calculate_shift(
351
+ image_seq_len,
352
+ self.scheduler.config.get("base_image_seq_len", 256),
353
+ self.scheduler.config.get("max_image_seq_len", 4096),
354
+ self.scheduler.config.get("base_shift", 0.5),
355
+ self.scheduler.config.get("max_shift", 1.15),
356
+ )
357
+ timesteps, num_inference_steps = retrieve_timesteps(
358
+ self.scheduler,
359
+ num_inference_steps,
360
+ device,
361
+ sigmas=sigmas,
362
+ mu=mu,
363
+ )
364
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
365
+ self._num_timesteps = len(timesteps)
366
+
367
+ # handle guidance
368
+ if self.transformer.config.guidance_embeds:
369
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
370
+ guidance = guidance.expand(latents.shape[0])
371
+ else:
372
+ guidance = None
373
+
374
+ if self.joint_attention_kwargs is None:
375
+ self._joint_attention_kwargs = {}
376
+
377
+ # 6. Denoising loop
378
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
379
+ for i, t in enumerate(timesteps):
380
+ if self.interrupt:
381
+ continue
382
+
383
+ self._current_timestep = t
384
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
385
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
386
+ self.joint_attention_kwargs.update({'timestep': t/1000})
387
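+ # With shared attention, the reference slots are reset to their clean latents at every step, so only the masked (generated) slot is actually denoised while attending to the references.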
+ if self.joint_attention_kwargs.get('shared_attn', False) and latents_ref is not None and latents_mask is not None:
388
+ latents = (1 - latents_mask) * latents_ref + latents_mask * latents
389
+
390
+ noise_pred = self.transformer(
391
+ hidden_states=latents,
392
+ timestep=timestep / 1000,
393
+ guidance=guidance,
394
+ pooled_projections=pooled_prompt_embeds,
395
+ encoder_hidden_states=prompt_embeds,
396
+ txt_ids=text_ids,
397
+ img_ids=latent_image_ids,
398
+ joint_attention_kwargs=self.joint_attention_kwargs,
399
+ return_dict=False,
400
+ )[0]
401
+
402
+ if do_true_cfg and i>=1:
403
+ neg_noise_pred = self.transformer(
404
+ hidden_states=latents,
405
+ timestep=timestep / 1000,
406
+ guidance=guidance,
407
+ pooled_projections=negative_pooled_prompt_embeds,
408
+ encoder_hidden_states=negative_prompt_embeds,
409
+ txt_ids=text_ids,
410
+ img_ids=latent_image_ids,
411
+ joint_attention_kwargs={**self.joint_attention_kwargs, 'neg_mode': True},
412
+ return_dict=False,
413
+ )[0]
414
+
415
+ if image_cfg_scale > 0:
416
+ image_noise_pred = self.transformer(
417
+ hidden_states=latents,
418
+ timestep=timestep / 1000,
419
+ guidance=guidance,
420
+ pooled_projections=negative_pooled_prompt_embeds,
421
+ encoder_hidden_states=negative_prompt_embeds,
422
+ txt_ids=text_ids,
423
+ img_ids=latent_image_ids,
424
+ joint_attention_kwargs=self.joint_attention_kwargs,
425
+ return_dict=False,
426
+ )[0]
427
+
428
+ if image_cfg_scale == 0:
429
+ noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)
430
+ else:
431
+ noise_pred = normalized_guidance_image(neg_noise_pred, noise_pred, image_noise_pred, true_cfg_scale, image_cfg_scale)
432
+
433
+ # compute the previous noisy sample x_t -> x_t-1
434
+ latents_dtype = latents.dtype
435
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
436
+
437
+ if latents.dtype != latents_dtype:
438
+ if torch.backends.mps.is_available():
439
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
440
+ latents = latents.to(latents_dtype)
441
+
442
+ if callback_on_step_end is not None:
443
+ callback_kwargs = {}
444
+ for k in callback_on_step_end_tensor_inputs:
445
+ callback_kwargs[k] = locals()[k]
446
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
447
+
448
+ latents = callback_outputs.pop("latents", latents)
449
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
450
+
451
+ # call the callback, if provided
452
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
453
+ progress_bar.update()
454
+
455
+ if XLA_AVAILABLE:
456
+ xm.mark_step()
457
+
458
+ self._current_timestep = None
459
+
460
+ if output_type == "latent":
461
+ image = latents
462
+ else:
463
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
464
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
465
+ image = self.vae.decode(latents, return_dict=False)
466
+
467
+ # Offload all models
468
+ self.maybe_free_model_hooks()
469
+
470
+ return (image,)
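To make the custom arguments easier to see in isolation, below is a hedged sketch of a direct `SynCDFluxPipeline` call with a single reference image, mirroring how `app.py` packs `latents_ref` and `latents_mask`. Here `pipeline`, `torch_dtype`, and a 512×512 RGB PIL image `ref_img` are assumed to be set up beforehand (as in `app.py`); this is an illustration, not a verified standalone script.

```python
# Sketch: one reference view + one generated view, laid out side by side (width = 2 * 512).
import numpy as np
import torch
from einops import rearrange

numref = 2  # slot 0 is generated, slot 1 is the reference image

ref = 2. * torch.from_numpy(np.array(ref_img)).permute(2, 0, 1).unsqueeze(0).to(torch_dtype) / 255. - 1.
with torch.no_grad():
    ref = pipeline.vae.encode(ref.to(pipeline.device)).latent_dist.sample() * pipeline.vae.config.scaling_factor

latents = torch.cat([torch.zeros_like(ref), ref], dim=0)   # zero out the slot to be generated
mask = torch.zeros_like(latents)
mask[:1] = 1.                                              # denoise only slot 0
latents = rearrange(latents, "(b n) c h w -> b c h (n w)", n=numref)
mask = rearrange(mask, "(b n) c h w -> b c h (n w)", n=numref)
B, C, H, W = latents.shape

out = pipeline(
    "A toy on a beach. Realistic shot.",
    latents_ref=pipeline._pack_latents(latents, B, C, H, W),
    latents_mask=pipeline._pack_latents(mask.expand(-1, C, -1, -1), B, C, H, W),
    height=512, width=numref * 512,
    guidance_scale=3.5, num_inference_steps=30,
    joint_attention_kwargs={"shared_attn": True, "num": numref},
    return_dict=False,
)[0][0]  # decoded tensor roughly in [-1, 1]; the generated view is the first 512-pixel-wide block
```

`app.py` above shows how this tensor is split back into per-view images and converted to a PIL image.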
pipelines/flux_pipeline/transformer.py ADDED
@@ -0,0 +1,795 @@
1
+ # https://github.com/bghira/SimpleTuner/blob/d0b5f37913a80aabdb0cac893937072dfa3e6a4b/helpers/models/flux/transformer.py#L404
2
+ # Copyright 2024 Stability AI, The HuggingFace Team, The InstantX Team, and Terminus Research Group. All rights reserved.
3
+ #
4
+ # Originally licensed under the Apache License, Version 2.0 (the "License");
5
+ # Updated to "Affero GENERAL PUBLIC LICENSE Version 3, 19 November 2007" via extensive updates to attn_mask usage.
6
+
7
+ import math
8
+ from contextlib import contextmanager
9
+ from typing import Any, Dict, Optional, Tuple, Union
10
+
11
+ import torch
12
+ import torch.nn as nn
13
+ import torch.nn.functional as F
14
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
15
+ from diffusers.loaders import FromOriginalModelMixin, PeftAdapterMixin
16
+ from diffusers.models.attention import FeedForward
17
+ from diffusers.models.attention_processor import Attention, AttentionProcessor
18
+ from diffusers.models.embeddings import (
19
+ CombinedTimestepGuidanceTextProjEmbeddings,
20
+ CombinedTimestepTextProjEmbeddings,
21
+ FluxPosEmbed,
22
+ )
23
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
24
+ from diffusers.models.modeling_utils import ModelMixin
25
+ from diffusers.models.normalization import (
26
+ AdaLayerNormContinuous,
27
+ AdaLayerNormZero,
28
+ AdaLayerNormZeroSingle,
29
+ )
30
+ from diffusers.utils import (
31
+ USE_PEFT_BACKEND,
32
+ is_torch_version,
33
+ logging,
34
+ scale_lora_layers,
35
+ unscale_lora_layers,
36
+ )
37
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
38
+ from einops import rearrange
39
+ from peft.tuners.lora.layer import LoraLayer
40
+
41
+ # Import flex_attention for optimized attention with fixed masks
42
+ try:
43
+ from torch.nn.attention.flex_attention import flex_attention, create_block_mask
44
+ FLEX_ATTENTION_AVAILABLE = True
45
+ except ImportError:
46
+ FLEX_ATTENTION_AVAILABLE = False
47
+
48
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
49
+
50
+ flex_attention_func = None
51
+ block_mask = None
52
+
53
+ class FluxAttnProcessor2_0:
54
+ """Attention processor used typically in processing the SD3-like self-attention projections."""
55
+
56
+ def __init__(self):
57
+ if not hasattr(F, "scaled_dot_product_attention"):
58
+ raise ImportError(
59
+ "FluxAttnProcessor2_0 requires PyTorch 2.0, to use it, please upgrade PyTorch to 2.0."
60
+ )
61
+ self.name = None
62
+
63
+ def __call__(
64
+ self,
65
+ attn: Attention,
66
+ hidden_states: torch.FloatTensor,
67
+ encoder_hidden_states: torch.FloatTensor = None,
68
+ attention_mask: Optional[torch.FloatTensor] = None,
69
+ image_rotary_emb: Optional[torch.Tensor] = None,
70
+ shared_attn: bool = False, num=2,
71
+ scale: float = 1.0,
72
+ timestep: float = 0,
73
+ neg_mode: bool = False,
74
+ ) -> torch.FloatTensor:
75
+
76
+ batch_size, _, _ = (
77
+ hidden_states.shape
78
+ if encoder_hidden_states is None
79
+ else encoder_hidden_states.shape
80
+ )
81
+ end_of_hidden_states = hidden_states.shape[1]
82
+ text_seq = 512
83
+ mask = None
84
+ query = attn.to_q(hidden_states)
85
+ key = attn.to_k(hidden_states)
86
+ value = attn.to_v(hidden_states)
87
+
88
+ inner_dim = key.shape[-1]
89
+ head_dim = inner_dim // attn.heads
90
+
91
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
92
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
93
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
94
+
95
+ if attn.norm_q is not None:
96
+ query = attn.norm_q(query)
97
+ if attn.norm_k is not None:
98
+ key = attn.norm_k(key)
99
+
100
+ # the attention in FluxSingleTransformerBlock does not use `encoder_hidden_states`
101
+ if encoder_hidden_states is not None:
102
+ # `context` projections.
103
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
104
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
105
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
106
+
107
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
108
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
109
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
110
+
111
+ if attn.norm_added_q is not None:
112
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
113
+ if attn.norm_added_k is not None:
114
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
115
+
116
+ # attention
117
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
118
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
119
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
120
+
121
+ if image_rotary_emb is not None:
122
+ from diffusers.models.embeddings import apply_rotary_emb
123
+ query = apply_rotary_emb(query, image_rotary_emb).to(hidden_states.dtype)
124
+ key = apply_rotary_emb(key, image_rotary_emb).to(hidden_states.dtype)
125
+
126
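+ # Positive pass: no mask, so all views attend to one another across the side-by-side layout (shared attention). Negative-prompt pass (neg_mode): each view attends only to itself and the text tokens.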
+ if neg_mode and FLEX_ATTENTION_AVAILABLE:
127
+ # Apply flex_attention with the block mask
128
+ global block_mask
129
+ need_new_mask = block_mask is None
130
+
131
+ if need_new_mask:
132
+ res = int(math.sqrt((end_of_hidden_states-(text_seq if encoder_hidden_states is None else 0)) // num))
133
+ seq_len = query.shape[2]
134
+
135
+ def block_diagonal_mask(b, h, q_idx, kv_idx):
136
+ text_offset = 512
137
+ # Text tokens (first 512) can attend to everything
138
+ # Use tensor operations instead of if statements
139
+ is_text = (q_idx < text_offset) | (kv_idx < text_offset)
140
+
141
+ # For spatial tokens, compute which block they belong to
142
+ q_spatial = q_idx - text_offset
143
+ kv_spatial = kv_idx - text_offset
144
+
145
+ # Determine block indices
146
+ q_block = (q_spatial // res) % num
147
+ kv_block = (kv_spatial // res) % num
148
+
149
+ # Only attend within the same block
150
+ same_block = (q_block == kv_block)
151
+
152
+ # Return: text can attend to everything OR same block
153
+ return is_text | same_block
154
+
155
+ # Create block mask for efficiency
156
+ block_mask = create_block_mask(block_diagonal_mask, B=1, H=None,
157
+ Q_LEN=seq_len, KV_LEN=seq_len, device=query.device)
158
+
159
+ hidden_states = flex_attention(query, key, value, block_mask=block_mask)
160
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
161
+ elif neg_mode:
162
+ # Fallback to original implementation if flex_attention is not available
163
+ res = int(math.sqrt((end_of_hidden_states-(text_seq if encoder_hidden_states is None else 0)) // num))
164
+ hw = res*res
165
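+ # Dense boolean mask: text tokens (the first 512) may attend to everything; image tokens attend to the text and only to tokens from their own view.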
+ mask_ = torch.zeros(1, res, num*res, res, num*res).to(query.device)
166
+ for i in range(num):
167
+ mask_[:, :, i*res:(i+1)*res, :, i*res:(i+1)*res] = 1
168
+ mask_ = rearrange(mask_, "b h w h1 w1 -> b (h w) (h1 w1)")
169
+ mask = torch.ones(1, num*hw + 512, num*hw + 512, device=query.device, dtype=query.dtype)
170
+ mask[:, 512:, 512:] = mask_
171
+ mask = mask.bool()
172
+ mask = rearrange(mask.unsqueeze(0).expand(attn.heads, -1, -1, -1), "nh b ... -> b nh ...")
173
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=mask)
174
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
175
+ else:
176
+ # No masking needed
177
+ hidden_states = F.scaled_dot_product_attention(query, key, value, dropout_p=0.0, is_causal=False, attn_mask=None)
178
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
179
+
180
+ hidden_states = hidden_states.to(query.dtype)
181
+
182
+ if encoder_hidden_states is not None:
183
+ encoder_hidden_states, hidden_states = (
184
+ hidden_states[:, : encoder_hidden_states.shape[1]],
185
+ hidden_states[:, encoder_hidden_states.shape[1]:],
186
+ )
187
+ hidden_states = hidden_states[:, :end_of_hidden_states]
188
+
189
+ # linear proj
190
+ hidden_states = attn.to_out[0](hidden_states)
191
+ # dropout
192
+ hidden_states = attn.to_out[1](hidden_states)
193
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
194
+ return hidden_states, encoder_hidden_states
195
+ else:
196
+ return hidden_states[:, :end_of_hidden_states]
197
+
198
+
199
+ def expand_flux_attention_mask(
200
+ hidden_states: torch.Tensor,
201
+ attn_mask: torch.Tensor,
202
+ ) -> torch.Tensor:
203
+ """
204
+ Expand a mask so that the image is included.
205
+ """
206
+ bsz = attn_mask.shape[0]
207
+ assert bsz == hidden_states.shape[0]
208
+ residual_seq_len = hidden_states.shape[1]
209
+ mask_seq_len = attn_mask.shape[1]
210
+
211
+ expanded_mask = torch.ones(bsz, residual_seq_len)
212
+ expanded_mask[:, :mask_seq_len] = attn_mask
213
+
214
+ return expanded_mask
215
+
216
+
217
+ @maybe_allow_in_graph
218
+ class FluxSingleTransformerBlock(nn.Module):
219
+ def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
220
+ super().__init__()
221
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
222
+
223
+ self.norm = AdaLayerNormZeroSingle(dim)
224
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
225
+ self.act_mlp = nn.GELU(approximate="tanh")
226
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
227
+
228
+ processor = FluxAttnProcessor2_0()
229
+ # processor = FluxSingleAttnProcessor3_0()
230
+
231
+ self.attn = Attention(
232
+ query_dim=dim,
233
+ cross_attention_dim=None,
234
+ dim_head=attention_head_dim,
235
+ heads=num_attention_heads,
236
+ out_dim=dim,
237
+ bias=True,
238
+ processor=processor,
239
+ qk_norm="rms_norm",
240
+ eps=1e-6,
241
+ pre_only=True,
242
+ )
243
+
244
+ def forward(
245
+ self,
246
+ hidden_states: torch.FloatTensor,
247
+ temb: torch.FloatTensor,
248
+ image_rotary_emb=None,
249
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
250
+ ):
251
+ dtype = hidden_states.dtype
252
+ residual = hidden_states
253
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
254
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
255
+
256
+ attn_output = self.attn(
257
+ hidden_states=norm_hidden_states.to(dtype),
258
+ image_rotary_emb=image_rotary_emb,
259
+ **joint_attention_kwargs,
260
+ )
261
+
262
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
263
+ gate = gate.unsqueeze(1)
264
+ hidden_states = gate * self.proj_out(hidden_states)
265
+ hidden_states = residual + hidden_states
266
+
267
+ return hidden_states
268
+
269
+
270
+ @maybe_allow_in_graph
271
+ class FluxTransformerBlock(nn.Module):
272
+ def __init__(
273
+ self, dim, num_attention_heads, attention_head_dim, qk_norm="rms_norm", eps=1e-6
274
+ ):
275
+ super().__init__()
276
+
277
+ self.norm1 = AdaLayerNormZero(dim)
278
+
279
+ self.norm1_context = AdaLayerNormZero(dim)
280
+
281
+ if hasattr(F, "scaled_dot_product_attention"):
282
+ processor = FluxAttnProcessor2_0()
283
+ else:
284
+ raise ValueError(
285
+ "The current PyTorch version does not support the `scaled_dot_product_attention` function."
286
+ )
287
+ self.attn = Attention(
288
+ query_dim=dim,
289
+ cross_attention_dim=None,
290
+ added_kv_proj_dim=dim,
291
+ dim_head=attention_head_dim,
292
+ heads=num_attention_heads,
293
+ out_dim=dim,
294
+ context_pre_only=False,
295
+ bias=True,
296
+ processor=processor,
297
+ qk_norm=qk_norm,
298
+ eps=eps,
299
+ )
300
+
301
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
302
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
303
+
304
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
305
+ self.ff_context = FeedForward(
306
+ dim=dim, dim_out=dim, activation_fn="gelu-approximate"
307
+ )
308
+
309
+ # let chunk size default to None
310
+ self._chunk_size = None
311
+ self._chunk_dim = 0
312
+
313
+ def forward(
314
+ self,
315
+ hidden_states: torch.FloatTensor,
316
+ encoder_hidden_states: torch.FloatTensor,
317
+ temb: torch.FloatTensor,
318
+ image_rotary_emb=None,
319
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None
320
+ ):
321
+ dtype = hidden_states.dtype
322
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
323
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = (self.norm1_context(encoder_hidden_states, emb=temb))
324
+
325
+ # Attention.
326
+ attn_output, context_attn_output = self.attn(
327
+ hidden_states=norm_hidden_states.to(dtype),
328
+ encoder_hidden_states=norm_encoder_hidden_states.to(dtype),
329
+ image_rotary_emb=image_rotary_emb,
330
+ **joint_attention_kwargs,
331
+ )
332
+
333
+ # Process attention outputs for the `hidden_states`.
334
+ attn_output = gate_msa.unsqueeze(1) * attn_output
335
+ hidden_states = hidden_states + attn_output
336
+
337
+ norm_hidden_states = self.norm2(hidden_states)
338
+ norm_hidden_states = (norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None])
339
+
340
+ ff_output = self.ff(norm_hidden_states)
341
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
342
+
343
+ hidden_states = hidden_states + ff_output
344
+
345
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
346
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
347
+
348
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
349
+ norm_encoder_hidden_states = (
350
+ norm_encoder_hidden_states * (1 + c_scale_mlp[:, None])
351
+ + c_shift_mlp[:, None]
352
+ )
353
+
354
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
355
+ encoder_hidden_states = (
356
+ encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
357
+ )
358
+
359
+ return encoder_hidden_states, hidden_states
360
+
361
+
362
+ @contextmanager
363
+ def set_adapter_scale(model, alpha):
364
+ original_scaling = {}
365
+ for module in model.modules():
366
+ if isinstance(module, LoraLayer):
367
+ original_scaling[module] = module.scaling.copy()
368
+ module.scaling = {k: v * alpha for k, v in module.scaling.items()}
369
+
370
+ # check whether scaling is prohibited on model
371
+ # the original scaling dictionary should be empty
372
+ # if there were no lora layers
373
+ if not original_scaling:
374
+ raise ValueError("scaling is only supported for models with `LoraLayer`s")
375
+ try:
376
+ yield
377
+
378
+ finally:
379
+ # restore original scaling values after exiting the context
380
+ for module, scaling in original_scaling.items():
381
+ module.scaling = scaling
382
+
383
+
384
+ class FluxTransformer2DModelWithMasking(
385
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin
386
+ ):
387
+ """
388
+ The Transformer model introduced in Flux.
389
+
390
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
391
+
392
+ Parameters:
393
+ patch_size (`int`): Patch size to turn the input data into small patches.
394
+ in_channels (`int`, *optional*, defaults to 16): The number of channels in the input.
395
+ num_layers (`int`, *optional*, defaults to 18): The number of layers of MMDiT blocks to use.
396
+ num_single_layers (`int`, *optional*, defaults to 18): The number of layers of single DiT blocks to use.
397
+ attention_head_dim (`int`, *optional*, defaults to 64): The number of channels in each head.
398
+ num_attention_heads (`int`, *optional*, defaults to 18): The number of heads to use for multi-head attention.
399
+ joint_attention_dim (`int`, *optional*): The number of `encoder_hidden_states` dimensions to use.
400
+ pooled_projection_dim (`int`): Number of dimensions to use when projecting the `pooled_projections`.
401
+ guidance_embeds (`bool`, defaults to False): Whether to use guidance embeddings.
402
+ """
403
+
404
+ _supports_gradient_checkpointing = True
405
+
406
+ @register_to_config
407
+ def __init__(
408
+ self,
409
+ patch_size: int = 1,
410
+ in_channels: int = 64,
411
+ num_layers: int = 19,
412
+ num_single_layers: int = 38,
413
+ attention_head_dim: int = 128,
414
+ num_attention_heads: int = 24,
415
+ joint_attention_dim: int = 4096,
416
+ pooled_projection_dim: int = 768,
417
+ guidance_embeds: bool = False,
418
+ axes_dims_rope: Tuple[int] = (16, 56, 56),
419
+ ##
420
+ ):
421
+ super().__init__()
422
+ self.out_channels = in_channels
423
+ self.inner_dim = (
424
+ self.config.num_attention_heads * self.config.attention_head_dim
425
+ )
426
+
427
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
428
+ text_time_guidance_cls = (
429
+ CombinedTimestepGuidanceTextProjEmbeddings
430
+ if guidance_embeds
431
+ else CombinedTimestepTextProjEmbeddings
432
+ )
433
+ self.time_text_embed = text_time_guidance_cls(
434
+ embedding_dim=self.inner_dim,
435
+ pooled_projection_dim=self.config.pooled_projection_dim,
436
+ )
437
+
438
+ self.context_embedder = nn.Linear(
439
+ self.config.joint_attention_dim, self.inner_dim
440
+ )
441
+ self.x_embedder = torch.nn.Linear(self.config.in_channels, self.inner_dim)
442
+
443
+ self.transformer_blocks = nn.ModuleList(
444
+ [
445
+ FluxTransformerBlock(
446
+ dim=self.inner_dim,
447
+ num_attention_heads=self.config.num_attention_heads,
448
+ attention_head_dim=self.config.attention_head_dim,
449
+ )
450
+ for i in range(self.config.num_layers)
451
+ ]
452
+ )
453
+
454
+ self.single_transformer_blocks = nn.ModuleList(
455
+ [
456
+ FluxSingleTransformerBlock(
457
+ dim=self.inner_dim,
458
+ num_attention_heads=self.config.num_attention_heads,
459
+ attention_head_dim=self.config.attention_head_dim,
460
+ )
461
+ for i in range(self.config.num_single_layers)
462
+ ]
463
+ )
464
+
465
+ self.norm_out = AdaLayerNormContinuous(
466
+ self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6
467
+ )
468
+ self.proj_out = nn.Linear(
469
+ self.inner_dim, patch_size * patch_size * self.out_channels, bias=True
470
+ )
471
+
472
+ self.gradient_checkpointing = False
473
+
474
+ @property
475
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
476
+ r"""
477
+ Returns:
478
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
479
+ indexed by its weight name.
480
+ """
481
+ # set recursively
482
+ processors = {}
483
+
484
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
485
+ if hasattr(module, "get_processor"):
486
+ processors[f"{name}.processor"] = module.get_processor()
487
+
488
+ for sub_name, child in module.named_children():
489
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
490
+
491
+ return processors
492
+
493
+ for name, module in self.named_children():
494
+ fn_recursive_add_processors(name, module, processors)
495
+
496
+ return processors
497
+
498
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
499
+ r"""
500
+ Sets the attention processor to use to compute attention.
501
+
502
+ Parameters:
503
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
504
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
505
+ for **all** `Attention` layers.
506
+
507
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
508
+ processor. This is strongly recommended when setting trainable attention processors.
509
+
510
+ """
511
+ count = len(self.attn_processors.keys())
512
+
513
+ if isinstance(processor, dict) and len(processor) != count:
514
+ raise ValueError(
515
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
516
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
517
+ )
518
+
519
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
520
+ if hasattr(module, "set_processor"):
521
+ if not isinstance(processor, dict):
522
+ module.set_processor(processor)
523
+ else:
524
+ module.set_processor(processor.pop(f"{name}.processor"))
525
+
526
+ for sub_name, child in module.named_children():
527
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
528
+
529
+ for name, module in self.named_children():
530
+ fn_recursive_attn_processor(name, module, processor)

    def _set_gradient_checkpointing(self, module, value=False):
        if hasattr(module, "gradient_checkpointing"):
            module.gradient_checkpointing = value
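
    # In diffusers, `ModelMixin.enable_gradient_checkpointing()` typically applies the hook
    # above to every submodule, flipping `gradient_checkpointing` on each block that defines it.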

    def forward(
        self,
        hidden_states: torch.Tensor,
        encoder_hidden_states: torch.Tensor = None,
        pooled_projections: torch.Tensor = None,
        timestep: torch.LongTensor = None,
        img_ids: torch.Tensor = None,
        txt_ids: torch.Tensor = None,
        guidance: torch.Tensor = None,
        # `attention_mask` is assumed here so that the masking smoke test in __main__ can
        # call forward() with it; it is handed to the attention processors via
        # `joint_attention_kwargs` below.
        attention_mask: Optional[torch.Tensor] = None,
        joint_attention_kwargs: Optional[Dict[str, Any]] = None,
        return_dict: bool = True,
    ) -> Union[torch.FloatTensor, Transformer2DModelOutput]:
        """
        The [`FluxTransformer2DModelWithMasking`] forward method.

        Args:
            hidden_states (`torch.FloatTensor` of shape `(batch_size, image_sequence_length, in_channels)`):
                Packed (patchified) input latents.
            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_len, embed_dims)`):
                Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
            pooled_projections (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
                Embeddings projected from the embeddings of input conditions.
            timestep (`torch.LongTensor`):
                Used to indicate the denoising step.
            attention_mask (`torch.Tensor`, *optional*):
                Token-level mask forwarded to the attention processors, e.g. to ignore padded text tokens.
            joint_attention_kwargs (`dict`, *optional*):
                A kwargs dictionary that, if specified, is passed along to the `AttentionProcessor` as defined under
                `self.processor` in
                [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
            return_dict (`bool`, *optional*, defaults to `True`):
                Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
                tuple.

        Returns:
            If `return_dict` is True, a [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
            `tuple` where the first element is the sample tensor.
        """
        if joint_attention_kwargs is not None:
            joint_attention_kwargs = joint_attention_kwargs.copy()
            lora_scale = joint_attention_kwargs.pop("scale", 1.0)
        else:
            lora_scale = 1.0

        if USE_PEFT_BACKEND:
            # weight the lora layers by setting `lora_scale` for each PEFT layer
            scale_lora_layers(self, lora_scale)
        else:
            if (
                joint_attention_kwargs is not None
                and joint_attention_kwargs.get("scale", None) is not None
            ):
                logger.warning(
                    "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
                )
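
        # Assumed wiring (not part of stock diffusers): expose `attention_mask` to the
        # masking attention processors through `joint_attention_kwargs`, since the blocks
        # below only receive that dict.
        if attention_mask is not None:
            joint_attention_kwargs = dict(joint_attention_kwargs or {})
            joint_attention_kwargs["attention_mask"] = attention_mask
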
        hidden_states = self.x_embedder(hidden_states)

        timestep = timestep.to(hidden_states.dtype) * 1000
        if guidance is not None:
            guidance = guidance.to(hidden_states.dtype) * 1000
        else:
            guidance = None
        temb = (
            self.time_text_embed(timestep, pooled_projections)
            if guidance is None
            else self.time_text_embed(timestep, guidance, pooled_projections)
        )
        encoder_hidden_states = self.context_embedder(encoder_hidden_states)

        if txt_ids.ndim == 3:
            txt_ids = txt_ids[0]
        if img_ids.ndim == 3:
            img_ids = img_ids[0]

        ids = torch.cat((txt_ids, img_ids), dim=0).to(hidden_states.dtype)

        image_rotary_emb = self.pos_embed(ids)

        for index_block, block in enumerate(self.transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                encoder_hidden_states, hidden_states = (
                    torch.utils.checkpoint.checkpoint(
                        create_custom_forward(block),
                        hidden_states,
                        encoder_hidden_states,
                        temb,
                        image_rotary_emb,
                        joint_attention_kwargs,
                        **ckpt_kwargs,
                    )
                )

            else:
                encoder_hidden_states, hidden_states = block(
                    hidden_states=hidden_states,
                    encoder_hidden_states=encoder_hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )

        # Flux places the text tokens in front of the image tokens in the
        # sequence.
        hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)

        for index_block, block in enumerate(self.single_transformer_blocks):
            if self.training and self.gradient_checkpointing:

                def create_custom_forward(module, return_dict=None):
                    def custom_forward(*inputs):
                        if return_dict is not None:
                            return module(*inputs, return_dict=return_dict)
                        else:
                            return module(*inputs)

                    return custom_forward

                ckpt_kwargs: Dict[str, Any] = (
                    {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                )
                hidden_states = torch.utils.checkpoint.checkpoint(
                    create_custom_forward(block),
                    hidden_states,
                    temb,
                    image_rotary_emb,
                    joint_attention_kwargs,
                    **ckpt_kwargs,
                )

            else:
                hidden_states = block(
                    hidden_states=hidden_states,
                    temb=temb,
                    image_rotary_emb=image_rotary_emb,
                    joint_attention_kwargs=joint_attention_kwargs,
                )

        # Drop the text tokens again and keep only the image tokens.
        hidden_states = hidden_states[:, encoder_hidden_states.shape[1]:, ...]

        hidden_states = self.norm_out(hidden_states, temb)
        output = self.proj_out(hidden_states)

        if USE_PEFT_BACKEND:
            # remove `lora_scale` from each PEFT layer
            unscale_lora_layers(self, lora_scale)

        if not return_dict:
            return (output,)

        return Transformer2DModelOutput(sample=output)


if __name__ == "__main__":
    dtype = torch.bfloat16
    bsz = 2
    img = torch.rand((bsz, 16, 64, 64)).to("cuda", dtype=dtype)
    timestep = torch.tensor([0.5, 0.5]).to("cuda", dtype=torch.float32)
    pooled = torch.rand(bsz, 768).to("cuda", dtype=dtype)
    text = torch.rand((bsz, 512, 4096)).to("cuda", dtype=dtype)
    attn_mask = torch.tensor([[1.0] * 384 + [0.0] * 128] * bsz).to(
        "cuda", dtype=dtype
    )  # Last 128 positions are masked

    def _pack_latents(latents, batch_size, num_channels_latents, height, width):
        latents = latents.view(
            batch_size, num_channels_latents, height // 2, 2, width // 2, 2
        )
        latents = latents.permute(0, 2, 4, 1, 3, 5)
        latents = latents.reshape(
            batch_size, (height // 2) * (width // 2), num_channels_latents * 4
        )

        return latents

    def _prepare_latent_image_ids(
        batch_size, height, width, device="cuda", dtype=dtype
    ):
        latent_image_ids = torch.zeros(height // 2, width // 2, 3)
        latent_image_ids[..., 1] = (
            latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
        )
        latent_image_ids[..., 2] = (
            latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
        )

        latent_image_id_height, latent_image_id_width, latent_image_id_channels = (
            latent_image_ids.shape
        )

        latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1)
        latent_image_ids = latent_image_ids.reshape(
            batch_size,
            latent_image_id_height * latent_image_id_width,
            latent_image_id_channels,
        )

        return latent_image_ids.to(device=device, dtype=dtype)
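
    # Shape check for the two helpers above, using the 64x64 latents built below:
    # _pack_latents groups every 2x2 spatial patch of the (bsz, 16, 64, 64) latent into one
    # token, giving (bsz, 32 * 32, 16 * 4) = (bsz, 1024, 64); _prepare_latent_image_ids emits
    # one 3-component positional id (a zero slot plus row and column indices) per packed
    # token, i.e. a (bsz, 1024, 3) tensor.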

    txt_ids = torch.zeros(bsz, text.shape[1], 3).to(device="cuda", dtype=dtype)

    vae_scale_factor = 16
    height = 2 * (int(512) // vae_scale_factor)
    width = 2 * (int(512) // vae_scale_factor)
    img_ids = _prepare_latent_image_ids(bsz, height, width)
    img = _pack_latents(img, img.shape[0], 16, height, width)

    # Gotta go fast: a deliberately small config so the smoke test runs quickly.
    transformer = FluxTransformer2DModelWithMasking.from_config(
        {
            "attention_head_dim": 128,
            "guidance_embeds": True,
            "in_channels": 64,
            "joint_attention_dim": 4096,
            "num_attention_heads": 24,
            "num_layers": 4,
            "num_single_layers": 8,
            "patch_size": 1,
            "pooled_projection_dim": 768,
        }
    ).to("cuda", dtype=dtype)

    guidance = torch.tensor([2.0], device="cuda")
    guidance = guidance.expand(bsz)

    with torch.no_grad():
        no_mask = transformer(
            img,
            encoder_hidden_states=text,
            pooled_projections=pooled,
            timestep=timestep,
            img_ids=img_ids,
            txt_ids=txt_ids,
            guidance=guidance,
        )
        mask = transformer(
            img,
            encoder_hidden_states=text,
            pooled_projections=pooled,
            timestep=timestep,
            img_ids=img_ids,
            txt_ids=txt_ids,
            guidance=guidance,
            attention_mask=attn_mask,
        )

    # The masked and unmasked runs must differ for the mask to have had any effect.
    assert not torch.allclose(no_mask.sample, mask.sample)
    print("Attention masking test ran OK. Differences in output were detected.")
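
    # In real use the text mask would come from the tokenizer rather than being hard-coded,
    # e.g. (assuming the T5 tokenizer that produces the 512-token Flux prompt embeddings):
    #   attn_mask = tokenizer(
    #       prompt, padding="max_length", max_length=512, truncation=True, return_tensors="pt"
    #   ).attention_mask.to("cuda", dtype=dtype)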
requirements.txt ADDED
@@ -0,0 +1,9 @@
diffusers
torch
transformers
peft
einops
numpy
Pillow
sentencepiece
huggingface_hub