build-tools / diffusers /modular_pipelines /flux2 /modular_blocks_flux2.py
salmankhanpm's picture
Add files using upload-large-folder tool
4f4376a verified
# Copyright 2025 The HuggingFace Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from ...utils import logging
from ..modular_pipeline import AutoPipelineBlocks, SequentialPipelineBlocks
from ..modular_pipeline_utils import InsertableDict, OutputParam
from .before_denoise import (
Flux2PrepareGuidanceStep,
Flux2PrepareImageLatentsStep,
Flux2PrepareLatentsStep,
Flux2RoPEInputsStep,
Flux2SetTimestepsStep,
)
from .decoders import Flux2DecodeStep, Flux2UnpackLatentsStep
from .denoise import Flux2DenoiseStep
from .encoders import (
Flux2TextEncoderStep,
Flux2VaeEncoderStep,
)
from .inputs import (
Flux2ProcessImagesInputStep,
Flux2TextInputStep,
)
# Module-level logger, named after this module via `__name__`.
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
# auto_docstring
class Flux2VaeEncoderSequentialStep(SequentialPipelineBlocks):
    """
    VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning.

    Components:
        image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)

    Inputs:
        image (`None`, *optional*):
            TODO: Add description.
        height (`None`, *optional*):
            TODO: Add description.
        width (`None`, *optional*):
            TODO: Add description.
        generator (`None`, *optional*):
            TODO: Add description.

    Outputs:
        condition_images (`list`):
            TODO: Add description.
        image_latents (`list`):
            List of latent representations for each reference image
    """

    model_name = "flux2"

    # `block_names[i]` labels `block_classes[i]`: image preprocessing first, VAE encoding second.
    block_names = ["preprocess", "encode"]
    block_classes = [Flux2ProcessImagesInputStep(), Flux2VaeEncoderStep()]

    @property
    def description(self) -> str:
        """One-sentence summary of what this sequential block does."""
        summary = "VAE encoder step that preprocesses, encodes, and prepares image latents for Flux2 conditioning."
        return summary
# auto_docstring
class Flux2AutoVaeEncoderStep(AutoPipelineBlocks):
    """
    VAE encoder step that encodes the image inputs into their latent representations.

    This is an auto pipeline block that works for image conditioning tasks.
        - `Flux2VaeEncoderSequentialStep` is used when `image` is provided.
        - If `image` is not provided, step will be skipped.

    Components:
        image_processor (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`)

    Inputs:
        image (`None`, *optional*):
            TODO: Add description.
        height (`None`, *optional*):
            TODO: Add description.
        width (`None`, *optional*):
            TODO: Add description.
        generator (`None`, *optional*):
            TODO: Add description.

    Outputs:
        condition_images (`list`):
            TODO: Add description.
        image_latents (`list`):
            List of latent representations for each reference image
    """

    # The three lists are positionally aligned: one candidate block, its name, and the
    # input whose presence selects it.
    block_names = ["img_conditioning"]
    block_trigger_inputs = ["image"]
    block_classes = [Flux2VaeEncoderSequentialStep]

    @property
    def description(self) -> str:
        """Multi-line summary of the block and of when its single candidate is used."""
        summary_lines = (
            "VAE encoder step that encodes the image inputs into their latent representations.",
            "This is an auto pipeline block that works for image conditioning tasks.",
            " - `Flux2VaeEncoderSequentialStep` is used when `image` is provided.",
            " - If `image` is not provided, step will be skipped.",
        )
        return "\n".join(summary_lines)
# Step name -> block instance pairs for the core denoise path without image conditioning.
# `Flux2CoreDenoiseStep` takes its `block_names` / `block_classes` from this mapping's
# `.keys()` / `.values()`, so the listing order here is the order those attributes expose.
Flux2CoreDenoiseBlocks = InsertableDict(
    [
        ("input", Flux2TextInputStep()),
        ("prepare_latents", Flux2PrepareLatentsStep()),
        ("set_timesteps", Flux2SetTimestepsStep()),
        ("prepare_guidance", Flux2PrepareGuidanceStep()),
        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
        ("denoise", Flux2DenoiseStep()),
        ("after_denoise", Flux2UnpackLatentsStep()),
    ]
)
# auto_docstring
class Flux2CoreDenoiseStep(SequentialPipelineBlocks):
    """
    Core denoise step that performs the denoising process for Flux2-dev.

    Components:
        scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)

    Inputs:
        num_images_per_prompt (`None`, *optional*, defaults to 1):
            TODO: Add description.
        prompt_embeds (`Tensor`):
            Pre-generated text embeddings. Can be generated from text_encoder step.
        height (`int`, *optional*):
            TODO: Add description.
        width (`int`, *optional*):
            TODO: Add description.
        latents (`Tensor | NoneType`, *optional*):
            TODO: Add description.
        generator (`None`, *optional*):
            TODO: Add description.
        num_inference_steps (`None`, *optional*, defaults to 50):
            TODO: Add description.
        timesteps (`None`, *optional*):
            TODO: Add description.
        sigmas (`None`, *optional*):
            TODO: Add description.
        guidance_scale (`None`, *optional*, defaults to 4.0):
            TODO: Add description.
        joint_attention_kwargs (`None`, *optional*):
            TODO: Add description.
        image_latents (`Tensor`, *optional*):
            Packed image latents for conditioning. Shape: (B, img_seq_len, C)
        image_latent_ids (`Tensor`, *optional*):
            Position IDs for image latents. Shape: (B, img_seq_len, 4)

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "flux2"

    # Names and blocks both come from the same `Flux2CoreDenoiseBlocks` mapping, so they stay aligned.
    block_names = Flux2CoreDenoiseBlocks.keys()
    block_classes = Flux2CoreDenoiseBlocks.values()

    @property
    def description(self) -> str:
        """One-sentence summary of what this sequential block does."""
        summary = "Core denoise step that performs the denoising process for Flux2-dev."
        return summary

    @property
    def outputs(self):
        """Output parameters exposed by this block: the `latents` template only."""
        latents_param = OutputParam.template("latents")
        return [latents_param]
# Step name -> block instance pairs for the image-conditioned core denoise path.
# Same entries as `Flux2CoreDenoiseBlocks`, with an additional "prepare_image_latents"
# entry placed right after "input". `Flux2ImageConditionedCoreDenoiseStep` takes its
# `block_names` / `block_classes` from this mapping's `.keys()` / `.values()`.
Flux2ImageConditionedCoreDenoiseBlocks = InsertableDict(
    [
        ("input", Flux2TextInputStep()),
        ("prepare_image_latents", Flux2PrepareImageLatentsStep()),
        ("prepare_latents", Flux2PrepareLatentsStep()),
        ("set_timesteps", Flux2SetTimestepsStep()),
        ("prepare_guidance", Flux2PrepareGuidanceStep()),
        ("prepare_rope_inputs", Flux2RoPEInputsStep()),
        ("denoise", Flux2DenoiseStep()),
        ("after_denoise", Flux2UnpackLatentsStep()),
    ]
)
# auto_docstring
class Flux2ImageConditionedCoreDenoiseStep(SequentialPipelineBlocks):
    """
    Core denoise step that performs the denoising process for Flux2-dev with image conditioning.

    Components:
        scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Flux2Transformer2DModel`)

    Inputs:
        num_images_per_prompt (`None`, *optional*, defaults to 1):
            TODO: Add description.
        prompt_embeds (`Tensor`):
            Pre-generated text embeddings. Can be generated from text_encoder step.
        image_latents (`list`, *optional*):
            TODO: Add description.
        height (`int`, *optional*):
            TODO: Add description.
        width (`int`, *optional*):
            TODO: Add description.
        latents (`Tensor | NoneType`, *optional*):
            TODO: Add description.
        generator (`None`, *optional*):
            TODO: Add description.
        num_inference_steps (`None`, *optional*, defaults to 50):
            TODO: Add description.
        timesteps (`None`, *optional*):
            TODO: Add description.
        sigmas (`None`, *optional*):
            TODO: Add description.
        guidance_scale (`None`, *optional*, defaults to 4.0):
            TODO: Add description.
        joint_attention_kwargs (`None`, *optional*):
            TODO: Add description.

    Outputs:
        latents (`Tensor`):
            Denoised latents.
    """

    model_name = "flux2"

    # Names and blocks both come from the same `Flux2ImageConditionedCoreDenoiseBlocks`
    # mapping, so they stay aligned.
    block_names = Flux2ImageConditionedCoreDenoiseBlocks.keys()
    block_classes = Flux2ImageConditionedCoreDenoiseBlocks.values()

    @property
    def description(self) -> str:
        """One-sentence summary of what this sequential block does."""
        summary = "Core denoise step that performs the denoising process for Flux2-dev with image conditioning."
        return summary

    @property
    def outputs(self):
        """Output parameters exposed by this block: the `latents` template only."""
        latents_param = OutputParam.template("latents")
        return [latents_param]
class Flux2AutoCoreDenoiseStep(AutoPipelineBlocks):
    """
    Auto core denoise step that performs the denoising process for Flux2-dev.

    This is an auto pipeline block that works for text-to-image and image-conditioned generation.
        - `Flux2ImageConditionedCoreDenoiseStep` ("image_conditioned") is listed against the
          `image_latents` trigger input.
        - `Flux2CoreDenoiseStep` ("text2image") is listed against a `None` trigger.
          NOTE(review): presumably `None` marks the default candidate used when no trigger
          input is provided -- confirm against `AutoPipelineBlocks`.
    """

    model_name = "flux2"

    # The three lists are positionally aligned: candidate block, its name, and its trigger input.
    block_classes = [Flux2ImageConditionedCoreDenoiseStep, Flux2CoreDenoiseStep]
    block_names = ["image_conditioned", "text2image"]
    block_trigger_inputs = ["image_latents", None]

    @property
    def description(self) -> str:
        """Multi-line summary of the block and of which candidate serves which workflow."""
        # Each sentence ends with "\n": without it the adjacent literals are concatenated
        # directly and the text renders as "...Flux2-dev.This is an auto pipeline block...".
        return (
            "Auto core denoise step that performs the denoising process for Flux2-dev.\n"
            "This is an auto pipeline block that works for text-to-image and image-conditioned generation.\n"
            " - `Flux2CoreDenoiseStep` is used for text-to-image generation.\n"
            " - `Flux2ImageConditionedCoreDenoiseStep` is used for image-conditioned generation.\n"
        )
# Top-level stage name -> block instance pairs for the full Flux2 auto pipeline.
# `Flux2AutoBlocks` takes its `block_names` / `block_classes` from this mapping's
# `.keys()` / `.values()`.
AUTO_BLOCKS = InsertableDict(
    [
        ("text_encoder", Flux2TextEncoderStep()),
        ("vae_encoder", Flux2AutoVaeEncoderStep()),
        ("denoise", Flux2AutoCoreDenoiseStep()),
        ("decode", Flux2DecodeStep()),
    ]
)
# auto_docstring
class Flux2AutoBlocks(SequentialPipelineBlocks):
    """
    Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2.

    Supported workflows:
        - `text2image`: requires `prompt`
        - `image_conditioned`: requires `image`, `prompt`

    Components:
        text_encoder (`Mistral3ForConditionalGeneration`) tokenizer (`AutoProcessor`) image_processor
        (`Flux2ImageProcessor`) vae (`AutoencoderKLFlux2`) scheduler (`FlowMatchEulerDiscreteScheduler`) transformer
        (`Flux2Transformer2DModel`)

    Inputs:
        prompt (`None`, *optional*):
            TODO: Add description.
        max_sequence_length (`int`, *optional*, defaults to 512):
            TODO: Add description.
        text_encoder_out_layers (`tuple`, *optional*, defaults to (10, 20, 30)):
            TODO: Add description.
        image (`None`, *optional*):
            TODO: Add description.
        height (`None`, *optional*):
            TODO: Add description.
        width (`None`, *optional*):
            TODO: Add description.
        generator (`None`, *optional*):
            TODO: Add description.
        num_images_per_prompt (`None`, *optional*, defaults to 1):
            TODO: Add description.
        image_latents (`list`, *optional*):
            TODO: Add description.
        latents (`Tensor | NoneType`):
            TODO: Add description.
        num_inference_steps (`None`):
            TODO: Add description.
        timesteps (`None`):
            TODO: Add description.
        sigmas (`None`, *optional*):
            TODO: Add description.
        guidance_scale (`None`, *optional*, defaults to 4.0):
            TODO: Add description.
        joint_attention_kwargs (`None`, *optional*):
            TODO: Add description.
        image_latent_ids (`Tensor`, *optional*):
            Position IDs for image latents. Shape: (B, img_seq_len, 4)
        output_type (`None`, *optional*, defaults to pil):
            TODO: Add description.

    Outputs:
        images (`list`):
            Generated images.
    """

    model_name = "flux2"

    # Names and blocks both come from the same `AUTO_BLOCKS` mapping, so they stay aligned.
    block_names = AUTO_BLOCKS.keys()
    block_classes = AUTO_BLOCKS.values()

    # Workflow name -> inputs flagged `True` for it (matches "Supported workflows" above).
    _workflow_map = {
        "text2image": {"prompt": True},
        "image_conditioned": {"image": True, "prompt": True},
    }

    @property
    def description(self) -> str:
        """One-sentence summary of what this pipeline preset does."""
        summary = "Auto Modular pipeline for text-to-image and image-conditioned generation using Flux2."
        return summary

    @property
    def outputs(self):
        """Output parameters exposed by this pipeline: the `images` template only."""
        images_param = OutputParam.template("images")
        return [images_param]