Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

multimodalart HF Staff committed on 4 days ago

Commit

a1f24dc

verified ·

1 Parent(s): 4c728e2

Bundle exactly PR #6 (df4eb9b) as the diffusers source

Browse files

Files changed (14) hide show

diffusers_src/src/diffusers/__init__.py +4 -0
diffusers_src/src/diffusers/modular_pipelines/__init__.py +8 -0
diffusers_src/src/diffusers/modular_pipelines/ideogram4/__init__.py +47 -0
diffusers_src/src/diffusers/modular_pipelines/ideogram4/before_denoise.py +558 -0
diffusers_src/src/diffusers/modular_pipelines/ideogram4/decoders.py +112 -0
diffusers_src/src/diffusers/modular_pipelines/ideogram4/denoise.py +363 -0
diffusers_src/src/diffusers/modular_pipelines/ideogram4/encoders.py +304 -0
diffusers_src/src/diffusers/modular_pipelines/ideogram4/modular_blocks_ideogram4.py +184 -0
diffusers_src/src/diffusers/modular_pipelines/ideogram4/modular_pipeline.py +46 -0
diffusers_src/src/diffusers/modular_pipelines/modular_pipeline.py +1 -0
diffusers_src/src/diffusers/pipelines/auto_pipeline.py +2 -0
diffusers_src/src/diffusers/pipelines/ideogram4/pipeline_ideogram4.py +55 -76
diffusers_src/src/diffusers/pipelines/ideogram4/prompt_enhancer.py +115 -0
diffusers_src/src/diffusers/utils/dummy_torch_and_transformers_objects.py +30 -0

diffusers_src/src/diffusers/__init__.py CHANGED Viewed

@@ -476,6 +476,8 @@ else:
             "HeliosPyramidModularPipeline",
             "HunyuanVideo15AutoBlocks",
             "HunyuanVideo15ModularPipeline",
             "LTXAutoBlocks",
             "LTXModularPipeline",
             "QwenImageAutoBlocks",
@@ -1297,6 +1299,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             HeliosPyramidModularPipeline,
             HunyuanVideo15AutoBlocks,
             HunyuanVideo15ModularPipeline,
             LTXAutoBlocks,
             LTXModularPipeline,
             QwenImageAutoBlocks,

             "HeliosPyramidModularPipeline",
             "HunyuanVideo15AutoBlocks",
             "HunyuanVideo15ModularPipeline",
+            "Ideogram4AutoBlocks",
+            "Ideogram4ModularPipeline",
             "LTXAutoBlocks",
             "LTXModularPipeline",
             "QwenImageAutoBlocks",
             HeliosPyramidModularPipeline,
             HunyuanVideo15AutoBlocks,
             HunyuanVideo15ModularPipeline,
+            Ideogram4AutoBlocks,
+            Ideogram4ModularPipeline,
             LTXAutoBlocks,
             LTXModularPipeline,
             QwenImageAutoBlocks,

diffusers_src/src/diffusers/modular_pipelines/__init__.py CHANGED Viewed

@@ -79,6 +79,10 @@ else:
         "Flux2KleinModularPipeline",
         "Flux2KleinBaseModularPipeline",
     ]
     _import_structure["qwenimage"] = [
         "QwenImageAutoBlocks",
         "QwenImageModularPipeline",
@@ -142,6 +146,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
             HunyuanVideo15AutoBlocks,
             HunyuanVideo15ModularPipeline,
         )
         from .ltx import LTXAutoBlocks, LTXModularPipeline
         from .modular_pipeline import (
             AutoPipelineBlocks,

         "Flux2KleinModularPipeline",
         "Flux2KleinBaseModularPipeline",
     ]
+    _import_structure["ideogram4"] = [
+        "Ideogram4AutoBlocks",
+        "Ideogram4ModularPipeline",
+    ]
     _import_structure["qwenimage"] = [
         "QwenImageAutoBlocks",
         "QwenImageModularPipeline",
             HunyuanVideo15AutoBlocks,
             HunyuanVideo15ModularPipeline,
         )
+        from .ideogram4 import (
+            Ideogram4AutoBlocks,
+            Ideogram4ModularPipeline,
+        )
         from .ltx import LTXAutoBlocks, LTXModularPipeline
         from .modular_pipeline import (
             AutoPipelineBlocks,

diffusers_src/src/diffusers/modular_pipelines/ideogram4/__init__.py ADDED Viewed

	@@ -0,0 +1,47 @@

+from typing import TYPE_CHECKING
+from ...utils import (
+    DIFFUSERS_SLOW_IMPORT,
+    OptionalDependencyNotAvailable,
+    _LazyModule,
+    get_objects_from_module,
+    is_torch_available,
+    is_transformers_available,
+)
+# Lazy-import boilerplate for the `ideogram4` modular-pipeline subpackage.
+# `_dummy_objects` collects the placeholder objects pulled from `dummy_torch_and_transformers_objects` when the
+# optional dependencies are missing; `_import_structure` maps submodule name -> public names and is handed to
+# `_LazyModule` below.
+_dummy_objects = {}
+_import_structure = {}
+# The try/raise/except shape routes the "torch or transformers missing" case into the `except` branch and the
+# "both available" case into `else`.
+try:
+    if not (is_transformers_available() and is_torch_available()):
+        raise OptionalDependencyNotAvailable()
+except OptionalDependencyNotAvailable:
+    from ...utils import dummy_torch_and_transformers_objects  # noqa F403
+    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
+else:
+    _import_structure["modular_blocks_ideogram4"] = ["Ideogram4AutoBlocks"]
+    _import_structure["modular_pipeline"] = ["Ideogram4ModularPipeline"]
+# Eager path: under type checking (or when slow imports are requested) import the real names directly.
+if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
+    try:
+        if not (is_transformers_available() and is_torch_available()):
+            raise OptionalDependencyNotAvailable()
+    except OptionalDependencyNotAvailable:
+        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
+    else:
+        from .modular_blocks_ideogram4 import Ideogram4AutoBlocks
+        from .modular_pipeline import Ideogram4ModularPipeline
+else:
+    # Lazy path: replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`.
+    import sys
+    sys.modules[__name__] = _LazyModule(
+        __name__,
+        globals()["__file__"],
+        _import_structure,
+        module_spec=__spec__,
+    )
+    # Attach the collected dummy objects (empty when both dependencies are available) to the lazy module.
+    for name, value in _dummy_objects.items():
+        setattr(sys.modules[__name__], name, value)

diffusers_src/src/diffusers/modular_pipelines/ideogram4/before_denoise.py ADDED Viewed

	@@ -0,0 +1,558 @@

+# Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+import torch
+from ...models.transformers.transformer_ideogram4 import (
+    IMAGE_POSITION_OFFSET,
+    LLM_TOKEN_INDICATOR,
+    OUTPUT_IMAGE_INDICATOR,
+    SEQUENCE_PADDING_INDICATOR,
+    Ideogram4Transformer2DModel,
+)
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging
+from ...utils.torch_utils import randn_tensor
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import Ideogram4ModularPipeline
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Default per-step guidance schedule (length must equal `num_inference_steps`): 7.0 for the main steps,
+# dropping to 3.0 for the final 3 "polish" steps.
+# The tuple has 45 + 3 = 48 entries, so it only fits a run with exactly 48 inference steps; any other step count
+# needs an explicit schedule of matching length.
+DEFAULT_GUIDANCE_SCHEDULE = (7.0,) * 45 + (3.0,) * 3
+# Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4._logit_normal_sigmas
+def _logit_normal_sigmas(
+    num_inference_steps: int,
+    mu: float,
+    std: float = 1.0,
+    logsnr_min: float = -15.0,
+    logsnr_max: float = 18.0,
+    device: torch.device | None = None,
+) -> torch.Tensor:
+    r"""
+    Build a length-`num_inference_steps` sigma schedule using the Ideogram4 logit-normal flow-matching schedule.
+    Sigmas are returned in `[0, 1]` in decreasing order (sigma close to 1 corresponds to pure noise, sigma close to 0
+    to clean data), matching diffusers conventions.
+    The Ideogram4 schedule applies `sigma(s) = 1 - logit_normal_cdf_inverse(1 - s)` to `s = linspace(0, 1, N + 1)` and
+    keeps the first `N` entries; a terminal zero is appended downstream by the scheduler.
+
+    Args:
+        num_inference_steps: Number of sigmas to return (`N`).
+        mu: Mean of the underlying normal, applied before the logistic.
+        std: Standard deviation of the underlying normal.
+        logsnr_min: Sets the upper clamp on `t` via `1 / (1 + exp(0.5 * logsnr_min))`.
+        logsnr_max: Sets the lower clamp on `t` via `1 / (1 + exp(0.5 * logsnr_max))`.
+        device: Device the returned tensor is moved to.
+
+    Returns:
+        A `float32` tensor of shape `(num_inference_steps,)`.
+    """
+    # The grid is built in float64 and only cast to float32 at the end.
+    intervals = torch.linspace(0.0, 1.0, num_inference_steps + 1, dtype=torch.float64)
+    # Apply the inverse CDF of a normal then push through the logistic to obtain a logit-normal CDF inverse.
+    z = torch.special.ndtri(intervals)
+    y = mu + std * z
+    t = 1.0 - torch.special.expit(y)
+    # The grid endpoints 0 and 1 map to -inf/+inf under `ndtri`, i.e. to t = 1 and t = 0; the clamp pulls them back
+    # inside (t_min, t_max).
+    t_min = 1.0 / (1.0 + math.exp(0.5 * logsnr_max))
+    t_max = 1.0 / (1.0 + math.exp(0.5 * logsnr_min))
+    t = t.clamp(t_min, t_max)
+    # Convert from model time (0 = noise, 1 = data) to diffusers sigma (1 = noise, 0 = data) and reverse.
+    sigmas = (1.0 - t).flip(0)
+    # Drop the trailing 0; FlowMatchEulerDiscreteScheduler.set_timesteps appends one back internally.
+    # NOTE(review): because of the clamp above, the dropped entry is `1 - t_max` (near 0, not exactly 0 for a finite
+    # `logsnr_min`), and the first kept sigma is `1 - t_min` rather than exactly 1.
+    sigmas = sigmas[:-1].to(dtype=torch.float32, device=device)
+    return sigmas
+# Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4._resolution_aware_mu
+def _resolution_aware_mu(
+    height: int,
+    width: int,
+    base_mu: float,
+    base_resolution: tuple[int, int] = (512, 512),
+) -> float:
+    """Shift the schedule mean as a function of image resolution.
+
+    Returns `base_mu + 0.5 * log(height * width / (base_h * base_w))`, so the result equals `base_mu` at the base
+    resolution, grows for larger images and shrinks for smaller ones.
+
+    Args:
+        height: Image height in pixels.
+        width: Image width in pixels.
+        base_mu: Mean used at `base_resolution`.
+        base_resolution: Reference `(height, width)` at which no shift is applied.
+    """
+    num_pixels = height * width
+    base_pixels = base_resolution[0] * base_resolution[1]
+    # `math.log` needs a strictly positive ratio; height and width are not validated here.
+    return base_mu + 0.5 * math.log(num_pixels / base_pixels)
+# Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4._expand_tensor_to_effective_batch
+def _expand_tensor_to_effective_batch(
+    tensor: torch.Tensor,
+    batch_size: int,
+    num_per_prompt: int,
+    tensor_name: str | None = None,
+) -> torch.Tensor:
+    """Replicate `tensor` along dim 0 from `batch_size` (or 1) to `batch_size * num_per_prompt`.
+
+    Args:
+        tensor: Tensor whose leading dimension is 1, `batch_size`, or already `batch_size * num_per_prompt`.
+        batch_size: Number of prompts.
+        num_per_prompt: Number of outputs generated per prompt.
+        tensor_name: Optional name used only in the error message.
+
+    Returns:
+        `tensor` itself when it already has the target batch size, otherwise a copy in which each row is repeated
+        consecutively (`repeat_interleave` ordering).
+
+    Raises:
+        ValueError: If the leading dimension is none of the three accepted sizes.
+    """
+    target_batch_size = batch_size * num_per_prompt
+    # Already expanded: return the input untouched.
+    if tensor.shape[0] == target_batch_size:
+        return tensor
+    if tensor.shape[0] == 1:
+        # A single row is broadcast to the whole effective batch.
+        repeat_by = target_batch_size
+    elif tensor.shape[0] == batch_size:
+        repeat_by = num_per_prompt
+    else:
+        tensor_name = f"`{tensor_name}`" if tensor_name is not None else "Tensor"
+        raise ValueError(
+            f"{tensor_name} batch size must be 1, `batch_size` ({batch_size}), or "
+            f"`batch_size * num_*_per_prompt` ({target_batch_size}), but got {tensor.shape[0]}."
+        )
+    # `output_size` is supplied up front, so PyTorch does not have to derive the result length from `repeats`.
+    return torch.repeat_interleave(tensor, repeats=repeat_by, dim=0, output_size=tensor.shape[0] * repeat_by)
+# auto_docstring
+class Ideogram4TextInputsStep(ModularPipelineBlocks):
+    """
+    Input step that determines `batch_size`/`dtype` from the per-prompt `text_features` and replicates the text outputs
+    to `batch_size * num_images_per_prompt`. Place after the text encoder.
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          text_features (`Tensor`):
+              Per-prompt text features from the encoder.
+          text_lengths (`list`):
+              Per-prompt text-token counts from the encoder.
+      Outputs:
+          batch_size (`int`):
+              Effective batch size (num prompts * num_images_per_prompt).
+          dtype (`dtype`):
+              The dtype of the text features.
+          text_features (`Tensor`):
+              Text features, batch-expanded.
+          text_lengths (`list`):
+              Text-token counts, batch-expanded.
+    """
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Input step that determines `batch_size`/`dtype` from the per-prompt `text_features` and replicates the "
+            "text outputs to `batch_size * num_images_per_prompt`. Place after the text encoder."
+        )
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("num_images_per_prompt", default=1),
+            InputParam(
+                name="text_features",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Per-prompt text features from the encoder.",
+            ),
+            InputParam(
+                name="text_lengths",
+                required=True,
+                type_hint=list,
+                description="Per-prompt text-token counts from the encoder.",
+            ),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="batch_size",
+                type_hint=int,
+                description="Effective batch size (num prompts * num_images_per_prompt).",
+            ),
+            OutputParam(name="dtype", type_hint=torch.dtype, description="The dtype of the text features."),
+            OutputParam(name="text_features", type_hint=torch.Tensor, description="Text features, batch-expanded."),
+            OutputParam(name="text_lengths", type_hint=list, description="Text-token counts, batch-expanded."),
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        # NOTE(review): the annotation says `PipelineState`, but the method returns the `(components, state)` pair.
+        block_state = self.get_block_state(state)
+        # The number of prompts is read off the leading dimension of the encoder output.
+        prompt_batch = block_state.text_features.shape[0]
+        num_per_prompt = block_state.num_images_per_prompt
+        block_state.dtype = block_state.text_features.dtype
+        block_state.text_features = _expand_tensor_to_effective_batch(
+            block_state.text_features, prompt_batch, num_per_prompt, "text_features"
+        )
+        # Each length is repeated `num_per_prompt` times in a row, the same ordering `repeat_interleave` gives the
+        # features. NOTE(review): `len(text_lengths)` is not checked against `prompt_batch`.
+        block_state.text_lengths = [n for n in block_state.text_lengths for _ in range(num_per_prompt)]
+        block_state.batch_size = prompt_batch * num_per_prompt
+        self.set_block_state(state, block_state)
+        return components, state
+# auto_docstring
+class Ideogram4PrepareLatentsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the packed image latents (B, num_image_tokens, latent_dim) for the denoising loop.
+      Components:
+          transformer (`Ideogram4Transformer2DModel`)
+      Inputs:
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          batch_size (`int`):
+              Effective batch size.
+      Outputs:
+          latents (`Tensor`):
+              The initial packed image latents (B, num_image_tokens, latent_dim).
+          num_image_tokens (`int`):
+              Number of image tokens (grid_h * grid_w).
+    """
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return "Step that prepares the packed image latents (B, num_image_tokens, latent_dim) for the denoising loop."
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("transformer", Ideogram4Transformer2DModel)]
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("latents"),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam.template("generator"),
+            InputParam(name="batch_size", required=True, type_hint=int, description="Effective batch size."),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="latents",
+                type_hint=torch.Tensor,
+                description="The initial packed image latents (B, num_image_tokens, latent_dim).",
+            ),
+            OutputParam(
+                name="num_image_tokens", type_hint=int, description="Number of image tokens (grid_h * grid_w)."
+            ),
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        # NOTE(review): the annotation says `PipelineState`, but the method returns the `(components, state)` pair.
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        patch = components.patch_size
+        # One image token per (vae_scale_factor * patch)-pixel square. Floor division silently drops any remainder
+        # when height/width are not multiples of that size.
+        grid_h = block_state.height // (components.vae_scale_factor * patch)
+        grid_w = block_state.width // (components.vae_scale_factor * patch)
+        num_image_tokens = grid_h * grid_w
+        latent_dim = components.transformer.config.in_channels
+        shape = (block_state.batch_size, num_image_tokens, latent_dim)
+        if block_state.latents is None:
+            # Fresh noise is always drawn in float32.
+            block_state.latents = randn_tensor(
+                shape, generator=block_state.generator, device=device, dtype=torch.float32
+            )
+        else:
+            # Caller-supplied latents are only moved/cast. NOTE(review): their shape is not checked against `shape`.
+            block_state.latents = block_state.latents.to(device=device, dtype=torch.float32)
+        block_state.num_image_tokens = num_image_tokens
+        self.set_block_state(state, block_state)
+        return components, state
+# auto_docstring
+class Ideogram4SetTimestepsStep(ModularPipelineBlocks):
+    """
+    Step that sets the resolution-aware logit-normal sigma schedule on the scheduler and resolves the per-step guidance
+    weights.
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`)
+      Inputs:
+          num_inference_steps (`int`, *optional*, defaults to 48):
+              The number of denoising steps.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          mu (`float`, *optional*, defaults to 0.0):
+              Base mean of the logit-normal schedule.
+          std (`float`, *optional*, defaults to 1.5):
+              Std of the logit-normal schedule.
+          guidance_schedule (`list`, *optional*, defaults to (7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
+          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
+          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 3.0, 3.0, 3.0)):
+              Per-step guidance scale schedule (length num_inference_steps).
+      Outputs:
+          timesteps (`Tensor`):
+              The denoising timesteps.
+          gw (`Tensor`):
+              Per-step guidance weights (num_inference_steps,).
+    """
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Step that sets the resolution-aware logit-normal sigma schedule on the scheduler and resolves the "
+            "per-step guidance weights."
+        )
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("num_inference_steps", default=48),
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam(name="mu", default=0.0, type_hint=float, description="Base mean of the logit-normal schedule."),
+            InputParam(name="std", default=1.5, type_hint=float, description="Std of the logit-normal schedule."),
+            # The default is a tuple even though the type hint says `list`; only `len()` and `torch.as_tensor` are
+            # applied to it in `__call__`.
+            InputParam(
+                name="guidance_schedule",
+                default=DEFAULT_GUIDANCE_SCHEDULE,
+                type_hint=list,
+                description="Per-step guidance scale schedule (length num_inference_steps).",
+            ),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(name="timesteps", type_hint=torch.Tensor, description="The denoising timesteps."),
+            OutputParam(
+                name="gw", type_hint=torch.Tensor, description="Per-step guidance weights (num_inference_steps,)."
+            ),
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        # NOTE(review): the annotation says `PipelineState`, but the method returns the `(components, state)` pair.
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        # One guidance weight is required per step. The default schedule has 48 entries, so overriding only
+        # `num_inference_steps` (without a matching `guidance_schedule`) ends up in this error.
+        if len(block_state.guidance_schedule) != block_state.num_inference_steps:
+            raise ValueError(
+                f"`guidance_schedule` must have length `num_inference_steps` ({block_state.num_inference_steps}), "
+                f"got {len(block_state.guidance_schedule)}."
+            )
+        # Shift the schedule mean by resolution, then build the sigma schedule from the shifted mean.
+        schedule_mu = _resolution_aware_mu(height=block_state.height, width=block_state.width, base_mu=block_state.mu)
+        sigmas = _logit_normal_sigmas(block_state.num_inference_steps, schedule_mu, std=block_state.std, device=device)
+        # The sigmas are handed to the scheduler as a plain Python list.
+        components.scheduler.set_timesteps(sigmas=sigmas.tolist(), device=device)
+        block_state.timesteps = components.scheduler.timesteps
+        block_state.gw = torch.as_tensor(block_state.guidance_schedule, dtype=torch.float32, device=device)
+        self.set_block_state(state, block_state)
+        return components, state
+# auto_docstring
+class Ideogram4PrepareAdditionalInputsStep(ModularPipelineBlocks):
+    """
+    Step that prepares the additional denoiser inputs from the packed-sequence layout: the conditional
+    encoder_hidden_states (text features packed with image padding) and the position_ids/segment_ids/indicator, plus
+    the unconditional (image-only) counterparts. Place after prepare_latents.
+      Inputs:
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          text_features (`Tensor`):
+              Batch-expanded text features.
+          text_lengths (`list`):
+              Batch-expanded text-token counts.
+          batch_size (`int`):
+              Effective batch size.
+      Outputs:
+          prompt_embeds (`Tensor`):
+              Packed conditional encoder_hidden_states (B, total_seq, dim).
+          position_ids (`Tensor`):
+              Conditional 3-axis MRoPE position ids.
+          segment_ids (`Tensor`):
+              Conditional block-diagonal segment ids.
+          indicator (`Tensor`):
+              Conditional per-token text/image/pad role.
+          negative_prompt_embeds (`Tensor`):
+              Unconditional (zeroed) text features (B, num_image_tokens, dim).
+          negative_position_ids (`Tensor`):
+              Unconditional position ids (image region).
+          negative_segment_ids (`Tensor`):
+              Unconditional segment ids (image region).
+          negative_indicator (`Tensor`):
+              Unconditional indicator (image region).
+    """
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Step that prepares the additional denoiser inputs from the packed-sequence layout: the conditional "
+            "encoder_hidden_states (text features packed with image padding) and the position_ids/segment_ids/"
+            "indicator, plus the unconditional (image-only) counterparts. Place after prepare_latents."
+        )
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam(
+                name="text_features",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Batch-expanded text features.",
+            ),
+            InputParam(
+                name="text_lengths", required=True, type_hint=list, description="Batch-expanded text-token counts."
+            ),
+            InputParam(name="batch_size", required=True, type_hint=int, description="Effective batch size."),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="prompt_embeds",
+                type_hint=torch.Tensor,
+                description="Packed conditional encoder_hidden_states (B, total_seq, dim).",
+            ),
+            OutputParam(
+                name="position_ids", type_hint=torch.Tensor, description="Conditional 3-axis MRoPE position ids."
+            ),
+            OutputParam(
+                name="segment_ids", type_hint=torch.Tensor, description="Conditional block-diagonal segment ids."
+            ),
+            OutputParam(
+                name="indicator", type_hint=torch.Tensor, description="Conditional per-token text/image/pad role."
+            ),
+            OutputParam(
+                name="negative_prompt_embeds",
+                type_hint=torch.Tensor,
+                description="Unconditional (zeroed) text features (B, num_image_tokens, dim).",
+            ),
+            OutputParam(
+                name="negative_position_ids",
+                type_hint=torch.Tensor,
+                description="Unconditional position ids (image region).",
+            ),
+            OutputParam(
+                name="negative_segment_ids",
+                type_hint=torch.Tensor,
+                description="Unconditional segment ids (image region).",
+            ),
+            OutputParam(
+                name="negative_indicator",
+                type_hint=torch.Tensor,
+                description="Unconditional indicator (image region).",
+            ),
+        ]
+    @staticmethod
+    # Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4.Ideogram4Pipeline._prepare_ids
+    def _prepare_ids(
+        text_lengths: list[int],
+        grid_h: int,
+        grid_w: int,
+        max_text_tokens: int,
+        device: torch.device,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Build the packed `[left-pad][text][image]` layout from the per-prompt text lengths and the image grid.
+        Returns `position_ids` (3-axis MRoPE), `segment_ids` (block-diagonal attention) and `indicator` (per-token
+        text/image/pad role).
+
+        Shapes follow from the allocations below: `position_ids` is `(batch, max_text_tokens + grid_h * grid_w, 3)`,
+        `segment_ids` and `indicator` are `(batch, max_text_tokens + grid_h * grid_w)`; all three are `torch.long`.
+        """
+        batch_size = len(text_lengths)
+        num_image_tokens = grid_h * grid_w
+        total_seq_len = max_text_tokens + num_image_tokens
+        # Image position ids (t=0, h, w); offset keeps them disjoint from text positions.
+        # The image tokens are enumerated row-major (h outer, w inner).
+        h_idx = torch.arange(grid_h).view(-1, 1).expand(grid_h, grid_w).reshape(-1)
+        w_idx = torch.arange(grid_w).view(1, -1).expand(grid_h, grid_w).reshape(-1)
+        t_idx = torch.zeros_like(h_idx)
+        image_pos = torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET
+        # Defaults describe a padding token: zero position, padding segment id, indicator 0. The loop below
+        # overwrites only the text and image spans, so the left pad keeps these values.
+        position_ids = torch.zeros(batch_size, total_seq_len, 3, dtype=torch.long)
+        segment_ids = torch.full((batch_size, total_seq_len), SEQUENCE_PADDING_INDICATOR, dtype=torch.long)
+        indicator = torch.zeros(batch_size, total_seq_len, dtype=torch.long)
+        for b, num_text in enumerate(text_lengths):
+            # Text is right-aligned inside the `max_text_tokens` window so the image block starts at the same
+            # index for every sample. NOTE(review): assumes `num_text <= max_text_tokens`; a larger value makes
+            # `offset` negative and is not validated here.
+            offset = max_text_tokens - num_text
+            text_pos = torch.arange(num_text)
+            # Text tokens use the same running index on all three position axes.
+            text_pos_3d = torch.stack([text_pos, text_pos, text_pos], dim=1)
+            position_ids[b, offset : offset + num_text] = text_pos_3d
+            position_ids[b, offset + num_text :] = image_pos
+            indicator[b, offset : offset + num_text] = LLM_TOKEN_INDICATOR
+            indicator[b, offset + num_text :] = OUTPUT_IMAGE_INDICATOR
+            # Text and image tokens of a sample share segment id 1; the pad keeps the padding indicator.
+            segment_ids[b, offset : offset + num_text + num_image_tokens] = 1
+        # Everything above is built without an explicit device; the results are moved only once, here.
+        return position_ids.to(device), segment_ids.to(device), indicator.to(device)
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        # NOTE(review): the annotation says `PipelineState`, but the method returns the `(components, state)` pair.
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        patch = components.patch_size
+        # Same grid computation as the latent-preparation step: floor division by (vae_scale_factor * patch).
+        grid_h = block_state.height // (components.vae_scale_factor * patch)
+        grid_w = block_state.width // (components.vae_scale_factor * patch)
+        num_image_tokens = grid_h * grid_w
+        text_features = block_state.text_features
+        # The padded text length and feature width are read from dims 1 and -1 of `text_features`.
+        # Presumably the encoder left-pads its features to match the `[left-pad][text]` layout built by
+        # `_prepare_ids` — TODO confirm against the encoder step.
+        max_text_tokens = text_features.shape[1]
+        feature_dim = text_features.shape[-1]
+        position_ids, segment_ids, indicator = self._prepare_ids(
+            block_state.text_lengths, grid_h, grid_w, max_text_tokens, device
+        )
+        # Pack the text features into the full sequence; image positions carry no text features.
+        image_feature_padding = torch.zeros(
+            block_state.batch_size, num_image_tokens, feature_dim, dtype=text_features.dtype, device=device
+        )
+        block_state.prompt_embeds = torch.cat([text_features, image_feature_padding], dim=1)
+        # Unconditional (image-only) branch, derived from the conditioning.
+        block_state.negative_prompt_embeds = torch.zeros(
+            block_state.batch_size, num_image_tokens, feature_dim, dtype=text_features.dtype, device=device
+        )
+        block_state.position_ids = position_ids
+        block_state.segment_ids = segment_ids
+        block_state.indicator = indicator
+        # The unconditional ids are the image tail of the conditional ones (everything past the text window).
+        block_state.negative_position_ids = position_ids[:, max_text_tokens:]
+        block_state.negative_segment_ids = segment_ids[:, max_text_tokens:]
+        block_state.negative_indicator = indicator[:, max_text_tokens:]
+        self.set_block_state(state, block_state)
+        return components, state

diffusers_src/src/diffusers/modular_pipelines/ideogram4/decoders.py ADDED Viewed

	@@ -0,0 +1,112 @@

+# Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from ...configuration_utils import FrozenDict
+from ...image_processor import VaeImageProcessor
+from ...models import AutoencoderKLFlux2
+from ...utils import logging
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import Ideogram4ModularPipeline
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# auto_docstring
+class Ideogram4DecodeStep(ModularPipelineBlocks):
+    """
+    Step that decodes the unpatchified (B, ae_channels, H, W) latents into images: de-normalizes with the VAE
+    batch-norm statistics and decodes through the VAE.
+      Components:
+          vae (`AutoencoderKLFlux2`) image_processor (`VaeImageProcessor`)
+      Inputs:
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+          latents (`Tensor`):
+              The unpatchified (B, ae_channels, H, W) latents to decode, from the after-denoise step.
+      Outputs:
+          images (`list`):
+              Generated images.
+    """
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Step that decodes the unpatchified (B, ae_channels, H, W) latents into images: de-normalizes with the "
+            "VAE batch-norm statistics and decodes through the VAE."
+        )
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("vae", AutoencoderKLFlux2),
+            # The image processor is created from config rather than loaded, with a fixed `vae_scale_factor` of 16.
+            ComponentSpec(
+                "image_processor",
+                VaeImageProcessor,
+                config=FrozenDict({"vae_scale_factor": 16}),
+                default_creation_method="from_config",
+            ),
+        ]
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("output_type", default="pil"),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The unpatchified (B, ae_channels, H, W) latents to decode, from the after-denoise step.",
+            ),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [OutputParam.template("images")]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        # NOTE(review): the annotation says `PipelineState`, but the method returns the `(components, state)` pair.
+        block_state = self.get_block_state(state)
+        z = block_state.latents
+        patch = components.patch_size
+        # Channel count and patch-grid size are taken from the latent tensor itself (dims 1, 2 and 3).
+        ae_channels = z.shape[1]
+        grid_h, grid_w = z.shape[2] // patch, z.shape[3] // patch
+        # VAE bn stores per-channel statistics over the packed channels, laid out as (patch_row, patch_col,
+        # ae_channel). Reshape them into an (ae_channels, patch, patch) tile and repeat across the grid so the
+        # denormalization on the unpatchified latents matches the packed-space statistics.
+        # The `view` calls require the bn buffers to hold exactly patch * patch * ae_channels entries.
+        bn_mean = components.vae.bn.running_mean.view(patch, patch, ae_channels).permute(2, 0, 1)
+        bn_std = torch.sqrt(components.vae.bn.running_var + components.vae.config.batch_norm_eps)
+        bn_std = bn_std.view(patch, patch, ae_channels).permute(2, 0, 1)
+        # After tiling, both statistics are (ae_channels, patch * grid_h, patch * grid_w) and broadcast over the
+        # batch dimension of `z`.
+        bn_mean = bn_mean.repeat(1, grid_h, grid_w).to(device=z.device, dtype=z.dtype)
+        bn_std = bn_std.repeat(1, grid_h, grid_w).to(device=z.device, dtype=z.dtype)
+        # Undo the normalization: x = z * std + mean.
+        z = z * bn_std + bn_mean
+        # Decode in the VAE's own dtype, then post-process the result in float32.
+        decoded = components.vae.decode(z.to(components.vae.dtype), return_dict=False)[0]
+        block_state.images = components.image_processor.postprocess(
+            decoded.float(), output_type=block_state.output_type
+        )
+        self.set_block_state(state, block_state)
+        return components, state

diffusers_src/src/diffusers/modular_pipelines/ideogram4/denoise.py ADDED Viewed

	@@ -0,0 +1,363 @@

+# Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from ...models.transformers.transformer_ideogram4 import Ideogram4Transformer2DModel
+from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging
+from ..modular_pipeline import (
+    BlockState,
+    LoopSequentialPipelineBlocks,
+    ModularPipelineBlocks,
+    PipelineState,
+)
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import Ideogram4ModularPipeline
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+class Ideogram4LoopBeforeDenoiser(ModularPipelineBlocks):
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Within the denoising loop: build the conditional packed input `[text-padding][image latents]` and the "
+            "model timestep. Compose into the `sub_blocks` of `Ideogram4DenoiseLoopWrapper`."
+        )
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam(name="latents", required=True, type_hint=torch.Tensor, description="Packed image latents."),
+            InputParam(
+                name="position_ids", required=True, type_hint=torch.Tensor, description="Conditional position ids."
+            ),
+            InputParam(name="batch_size", required=True, type_hint=int, description="Effective batch size."),
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        # Conditional packed sequence is [text-padding][image latents]; text region length = total - image tokens.
+        max_text_tokens = block_state.position_ids.shape[1] - block_state.latents.shape[1]
+        text_z_padding = torch.zeros(
+            block_state.latents.shape[0],
+            max_text_tokens,
+            block_state.latents.shape[-1],
+            dtype=block_state.latents.dtype,
+            device=block_state.latents.device,
+        )
+        block_state.pos_z = torch.cat([text_z_padding, block_state.latents], dim=1)
+        block_state.max_text_tokens = max_text_tokens
+        # Map sigma-domain timestep to model time t in [0, 1] (0 = noise, 1 = clean data).
+        num_train_timesteps = components.scheduler.config.num_train_timesteps
+        t_model = 1.0 - (t.float() / num_train_timesteps)
+        block_state.t_model = t_model.expand(block_state.batch_size)
+        return components, block_state
+class Ideogram4LoopDenoiser(ModularPipelineBlocks):
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Within the denoising loop: run the conditional `transformer` on the full packed sequence and the "
+            "`unconditional_transformer` on the image-only sequence, then blend with the per-step guidance weight "
+            "(asymmetric CFG, no guider). Compose into `Ideogram4DenoiseLoopWrapper`."
+        )
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("transformer", Ideogram4Transformer2DModel),
+            ComponentSpec("unconditional_transformer", Ideogram4Transformer2DModel),
+        ]
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam(
+                name="prompt_embeds",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Packed conditional encoder_hidden_states.",
+            ),
+            InputParam(
+                name="position_ids",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Conditional 3-axis MRoPE position ids.",
+            ),
+            InputParam(
+                name="segment_ids",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Conditional block-diagonal segment ids.",
+            ),
+            InputParam(
+                name="indicator",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Conditional per-token text/image/pad role.",
+            ),
+            InputParam(
+                name="negative_prompt_embeds",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Unconditional (zeroed) text features.",
+            ),
+            InputParam(
+                name="negative_position_ids",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Unconditional position ids (image region).",
+            ),
+            InputParam(
+                name="negative_segment_ids",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Unconditional segment ids (image region).",
+            ),
+            InputParam(
+                name="negative_indicator",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Unconditional indicator (image region).",
+            ),
+            InputParam(name="gw", required=True, type_hint=torch.Tensor, description="Per-step guidance weights."),
+            InputParam(name="latents", required=True, type_hint=torch.Tensor, description="Packed image latents."),
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        transformer = components.transformer
+        unconditional_transformer = components.unconditional_transformer
+        # Conditional pass operates on the full packed sequence; the velocity is the image-token region.
+        pos_out = transformer(
+            hidden_states=block_state.pos_z.to(transformer.dtype),
+            timestep=block_state.t_model.to(transformer.dtype),
+            encoder_hidden_states=block_state.prompt_embeds.to(transformer.dtype),
+            position_ids=block_state.position_ids,
+            segment_ids=block_state.segment_ids,
+            indicator=block_state.indicator,
+            return_dict=False,
+        )[0]
+        pos_v = pos_out[:, block_state.max_text_tokens :].to(torch.float32)
+        # Unconditional pass uses the image-only positions with zeroed text features.
+        neg_v = unconditional_transformer(
+            hidden_states=block_state.latents.to(unconditional_transformer.dtype),
+            timestep=block_state.t_model.to(unconditional_transformer.dtype),
+            encoder_hidden_states=block_state.negative_prompt_embeds.to(unconditional_transformer.dtype),
+            position_ids=block_state.negative_position_ids,
+            segment_ids=block_state.negative_segment_ids,
+            indicator=block_state.negative_indicator,
+            return_dict=False,
+        )[0].to(torch.float32)
+        gw_i = block_state.gw[i]
+        v = gw_i * pos_v + (1.0 - gw_i) * neg_v
+        # The scheduler integrates `-v` (Ideogram predicts velocity v = x0 - noise).
+        block_state.noise_pred = -v
+        return components, block_state
+class Ideogram4LoopAfterDenoiser(ModularPipelineBlocks):
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return "Within the denoising loop: scheduler step. Compose into `Ideogram4DenoiseLoopWrapper`."
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [OutputParam(name="latents", type_hint=torch.Tensor, description="The denoised latents.")]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
+        block_state.latents = components.scheduler.step(
+            block_state.noise_pred, t, block_state.latents, return_dict=False
+        )[0]
+        return components, block_state
+# auto_docstring
+class Ideogram4DenoiseStep(LoopSequentialPipelineBlocks):
+    """
+    Denoising loop that iteratively denoises the packed image latents over `timesteps`, running both the conditional
+    and unconditional transformers and blending with the per-step guidance schedule.
+      Components:
+          scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Ideogram4Transformer2DModel`)
+          unconditional_transformer (`Ideogram4Transformer2DModel`)
+      Inputs:
+          timesteps (`Tensor`):
+              Denoising timesteps from set_timesteps.
+          num_inference_steps (`int`, *optional*, defaults to 48):
+              The number of denoising steps.
+          latents (`Tensor`):
+              Packed image latents.
+          position_ids (`Tensor`):
+              Conditional position ids.
+          batch_size (`int`):
+              Effective batch size.
+          prompt_embeds (`Tensor`):
+              Packed conditional encoder_hidden_states.
+          position_ids (`Tensor`):
+              Conditional 3-axis MRoPE position ids.
+          segment_ids (`Tensor`):
+              Conditional block-diagonal segment ids.
+          indicator (`Tensor`):
+              Conditional per-token text/image/pad role.
+          negative_prompt_embeds (`Tensor`):
+              Unconditional (zeroed) text features.
+          negative_position_ids (`Tensor`):
+              Unconditional position ids (image region).
+          negative_segment_ids (`Tensor`):
+              Unconditional segment ids (image region).
+          negative_indicator (`Tensor`):
+              Unconditional indicator (image region).
+          gw (`Tensor`):
+              Per-step guidance weights.
+      Outputs:
+          latents (`Tensor`):
+              The denoised latents.
+    """
+    model_name = "ideogram4"
+    block_classes = [Ideogram4LoopBeforeDenoiser, Ideogram4LoopDenoiser, Ideogram4LoopAfterDenoiser]
+    block_names = ["before_denoiser", "denoiser", "after_denoiser"]
+    @property
+    def description(self) -> str:
+        return (
+            "Denoising loop that iteratively denoises the packed image latents over `timesteps`, running both the "
+            "conditional and unconditional transformers and blending with the per-step guidance schedule."
+        )
+    @property
+    def loop_expected_components(self) -> list[ComponentSpec]:
+        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]
+    @property
+    def loop_inputs(self) -> list[InputParam]:
+        return [
+            InputParam(
+                name="timesteps",
+                required=True,
+                type_hint=torch.Tensor,
+                description="Denoising timesteps from set_timesteps.",
+            ),
+            InputParam.template("num_inference_steps", default=48),
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        with self.progress_bar(total=block_state.num_inference_steps) as progress_bar:
+            for i, t in enumerate(block_state.timesteps):
+                components, block_state = self.loop_step(components, block_state, i=i, t=t)
+                progress_bar.update()
+        self.set_block_state(state, block_state)
+        return components, state
+# auto_docstring
+class Ideogram4AfterDenoiseStep(ModularPipelineBlocks):
+    """
+    Step that runs after the denoising loop: unpatchifies the packed image latents (B, num_image_tokens, ae_channels *
+    patch ** 2) into a (B, ae_channels, H, W) latent for the decoder.
+      Inputs:
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          latents (`Tensor`):
+              The denoised packed image latents (B, num_image_tokens, latent_dim).
+      Outputs:
+          latents (`Tensor`):
+              Unpatchified latents (B, ae_channels, H, W) ready for the VAE decoder.
+    """
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Step that runs after the denoising loop: unpatchifies the packed image latents "
+            "(B, num_image_tokens, ae_channels * patch ** 2) into a (B, ae_channels, H, W) latent for the decoder."
+        )
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("height", required=True),
+            InputParam.template("width", required=True),
+            InputParam(
+                name="latents",
+                required=True,
+                type_hint=torch.Tensor,
+                description="The denoised packed image latents (B, num_image_tokens, latent_dim).",
+            ),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="latents",
+                type_hint=torch.Tensor,
+                description="Unpatchified latents (B, ae_channels, H, W) ready for the VAE decoder.",
+            )
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        z = block_state.latents
+        patch = components.patch_size
+        grid_h = block_state.height // (components.vae_scale_factor * patch)
+        grid_w = block_state.width // (components.vae_scale_factor * patch)
+        ae_channels = z.shape[-1] // (patch * patch)
+        z = z.view(z.shape[0], grid_h, grid_w, patch, patch, ae_channels)
+        z = z.permute(0, 5, 1, 3, 2, 4).contiguous()
+        z = z.view(z.shape[0], ae_channels, grid_h * patch, grid_w * patch)
+        block_state.latents = z
+        self.set_block_state(state, block_state)
+        return components, state

diffusers_src/src/diffusers/modular_pipelines/ideogram4/encoders.py ADDED Viewed

	@@ -0,0 +1,304 @@

+# Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import torch
+from transformers import Qwen2Tokenizer, Qwen3VLModel
+from transformers.masking_utils import create_causal_mask
+from ...pipelines.ideogram4.prompt_enhancer import (
+    DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
+    PROMPT_UPSAMPLE_TEMPERATURE,
+    generate_captions,
+    graft_lm_head,
+)
+from ...utils import logging
+from ..modular_pipeline import ModularPipelineBlocks, PipelineState
+from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
+from .modular_pipeline import Ideogram4ModularPipeline
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Hidden states of these Qwen3-VL decoder layers are concatenated to form the per-token
+# text conditioning consumed by the Ideogram4 transformer.
+# Entries are 0-based indices into `text_encoder.language_model.layers` (13 taps in total); the tapped hidden
+# states are returned by `_get_text_encoder_hidden_states` in exactly this order.
+QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
+# auto_docstring
+class Ideogram4PromptUpsampleStep(ModularPipelineBlocks):
+    """
+    Optional step that rewrites the prompt(s) into Ideogram4's native structured JSON caption (the format the model
+    is trained on) when ``prompt_upsampling=True``. On first use it grafts a hosted LM head onto the (head-less)
+    text encoder to make it generative; install ``outlines`` for schema-constrained captions.
+      Components:
+          text_encoder (`Qwen3VLModel`): The Qwen3-VL text encoder. tokenizer (`Qwen2Tokenizer`): The tokenizer
+          paired with the text encoder.
+      Inputs:
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          prompt_upsampling (`bool`, *optional*, defaults to False):
+              If True, rewrite the prompt into the native JSON caption before encoding.
+          prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
+              Sampling temperature for prompt upsampling.
+          height (`int`, *optional*):
+              Together with width, sets the caption's target aspect ratio.
+          width (`int`, *optional*):
+              Together with height, sets the caption's target aspect ratio.
+          generator (`Generator`, *optional*):
+              Reused to make the upsampling reproducible.
+      Outputs:
+          prompt (`str`):
+              The (possibly upsampled) prompt forwarded to the text encoder.
+    """
+    model_name = "ideogram4"
+    def __init__(self, lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO):
+        self._lm_head_repo_id = lm_head_repo_id
+        # Grafted lazily on first upsample and cached (the encoder body is shared).
+        self._prompt_enhancer = None
+        self._caption_logits_processor = None
+        super().__init__()
+    @property
+    def description(self) -> str:
+        return (
+            "Optional step that rewrites the prompt(s) into Ideogram4's native structured JSON caption when "
+            "`prompt_upsampling=True` (the format the model is trained on). On first use it grafts a hosted LM head "
+            "onto the text encoder; install `outlines` for schema-constrained captions."
+        )
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", Qwen3VLModel, description="The Qwen3-VL text encoder."),
+            ComponentSpec("tokenizer", Qwen2Tokenizer, description="The tokenizer paired with the text encoder."),
+        ]
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("prompt", required=True),
+            InputParam(
+                name="prompt_upsampling",
+                type_hint=bool,
+                default=False,
+                description="If True, rewrite the prompt into Ideogram4's native JSON caption before encoding.",
+            ),
+            InputParam(
+                name="prompt_upsampling_temperature",
+                type_hint=float,
+                default=PROMPT_UPSAMPLE_TEMPERATURE,
+                description="Sampling temperature for prompt upsampling.",
+            ),
+            InputParam.template("height"),
+            InputParam.template("width"),
+            InputParam.template("generator"),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="prompt",
+                type_hint=list,
+                description="The (possibly upsampled) prompt forwarded to the text encoder.",
+            ),
+        ]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        block_state = self.get_block_state(state)
+        if block_state.prompt_upsampling:
+            if self._prompt_enhancer is None:
+                self._prompt_enhancer, self._caption_logits_processor = graft_lm_head(
+                    components.text_encoder, components.tokenizer, self._lm_head_repo_id
+                )
+            height = block_state.height or components.default_height
+            width = block_state.width or components.default_width
+            block_state.prompt = generate_captions(
+                self._prompt_enhancer,
+                components.tokenizer,
+                self._caption_logits_processor,
+                block_state.prompt,
+                height,
+                width,
+                temperature=block_state.prompt_upsampling_temperature,
+                generator=block_state.generator,
+                device=components._execution_device,
+            )
+        self.set_block_state(state, block_state)
+        return components, state
+# auto_docstring
+class Ideogram4TextEncoderStep(ModularPipelineBlocks):
+    """
+    Text encoder step that tokenizes the prompt(s) and runs the Qwen3-VL text encoder, returning the per-token text
+    features (concatenated from a fixed set of activation layers). Only the text tokens are encoded; the packed image
+    tokens are appended later (the encoder is causal with image after text, so they never affect the text features).
+      Components:
+          text_encoder (`Qwen3VLModel`): The Qwen3-VL text encoder. tokenizer (`Qwen2Tokenizer`): The tokenizer paired
+          with the text encoder.
+      Inputs:
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          max_sequence_length (`int`, *optional*, defaults to 2048):
+              Maximum sequence length for prompt encoding.
+      Outputs:
+          text_features (`Tensor`):
+              Per-prompt text features (B, max_sequence_length, llm_features_dim), padding zeroed.
+          text_lengths (`list`):
+              Per-prompt real text-token counts, used to lay out the packed sequence.
+    """
+    model_name = "ideogram4"
+    @property
+    def description(self) -> str:
+        return (
+            "Text encoder step that tokenizes the prompt(s) and runs the Qwen3-VL text encoder, returning the "
+            "per-token text features (concatenated from a fixed set of activation layers). Only the text tokens are "
+            "encoded; the packed image tokens are appended later (the encoder is causal with image after text, so "
+            "they never affect the text features)."
+        )
+    @property
+    def expected_components(self) -> list[ComponentSpec]:
+        return [
+            ComponentSpec("text_encoder", Qwen3VLModel, description="The Qwen3-VL text encoder."),
+            ComponentSpec("tokenizer", Qwen2Tokenizer, description="The tokenizer paired with the text encoder."),
+        ]
+    @property
+    def inputs(self) -> list[InputParam]:
+        return [
+            InputParam.template("prompt", required=True),
+            InputParam.template("max_sequence_length", default=2048),
+        ]
+    @property
+    def intermediate_outputs(self) -> list[OutputParam]:
+        return [
+            OutputParam(
+                name="text_features",
+                type_hint=torch.Tensor,
+                description="Per-prompt text features (B, max_sequence_length, llm_features_dim), padding zeroed.",
+            ),
+            OutputParam(
+                name="text_lengths",
+                type_hint=list,
+                description="Per-prompt real text-token counts, used to lay out the packed sequence.",
+            ),
+        ]
+    # NOTE(review): the method below carries a `# Copied from` marker, so it is presumably kept in sync with the
+    # named pipeline method — change it at the source rather than here.
+    @staticmethod
+    # Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4.Ideogram4Pipeline._get_text_encoder_hidden_states
+    def _get_text_encoder_hidden_states(
+        text_encoder,
+        token_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        pos_2d: torch.Tensor,
+    ) -> list[torch.Tensor]:
+        """Run the text encoder's decoder layers, returning the hidden states tapped at each activation layer."""
+        language_model = text_encoder.language_model
+        inputs_embeds = language_model.embed_tokens(token_ids)
+        position_ids_4d = pos_2d[None, ...].expand(4, pos_2d.shape[0], -1)
+        text_position_ids = position_ids_4d[0]
+        mrope_position_ids = position_ids_4d[1:]
+        causal_mask = create_causal_mask(
+            config=language_model.config,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            past_key_values=None,
+            position_ids=text_position_ids,
+        )
+        position_embeddings = language_model.rotary_emb(inputs_embeds, mrope_position_ids)
+        tap_set = set(QWEN3_VL_ACTIVATION_LAYERS)
+        captured: dict[int, torch.Tensor] = {}
+        hidden_states = inputs_embeds
+        for layer_idx, decoder_layer in enumerate(language_model.layers):
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=text_position_ids,
+                past_key_values=None,
+                position_embeddings=position_embeddings,
+            )
+            if layer_idx in tap_set:
+                captured[layer_idx] = hidden_states
+        return [captured[i] for i in QWEN3_VL_ACTIVATION_LAYERS]
+    @torch.no_grad()
+    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
+        # Tokenizes every prompt, encodes the left-padded batch and stores `text_features` / `text_lengths` on
+        # the block state. Raises ValueError when a prompt tokenizes to more than `max_sequence_length` tokens.
+        # Returns `(components, state)`.
+        block_state = self.get_block_state(state)
+        device = components._execution_device
+        tokenizer = components.tokenizer
+        max_text_tokens = block_state.max_sequence_length
+        # A single string becomes a one-element batch; any other value is materialized as a list of prompts.
+        prompts = [block_state.prompt] if isinstance(block_state.prompt, str) else list(block_state.prompt)
+        batch_size = len(prompts)
+        # Tokenize each chat-formatted prompt and left-pad to `max_sequence_length`.
+        # The buffers are created without an explicit device and moved to `device` after the loop.
+        token_ids = torch.zeros(batch_size, max_text_tokens, dtype=torch.long)
+        attention_mask = torch.zeros(batch_size, max_text_tokens, dtype=torch.long)
+        text_position_ids = torch.zeros(batch_size, max_text_tokens, dtype=torch.long)
+        text_lengths = []
+        for b, text_prompt in enumerate(prompts):
+            # Each prompt is wrapped as a single user turn with the generation prompt appended.
+            messages = [{"role": "user", "content": [{"type": "text", "text": text_prompt}]}]
+            text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+            # NOTE(review): `add_special_tokens=False` presumably because the chat template already emits the
+            # special tokens — confirm against the tokenizer's template.
+            toks = tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0]
+            n = int(toks.shape[0])
+            if n > max_text_tokens:
+                raise ValueError(f"prompt has {n} tokens, exceeds max_sequence_length={max_text_tokens}")
+            text_lengths.append(n)
+            # Real tokens occupy the last `n` slots of the row; the leading `offset` slots stay zero (padding)
+            # and masked out. Position ids restart at 0 on the first real token.
+            offset = max_text_tokens - n
+            token_ids[b, offset:] = toks
+            attention_mask[b, offset:] = 1
+            text_position_ids[b, offset:] = torch.arange(n)
+        token_ids = token_ids.to(device)
+        attention_mask = attention_mask.to(device)
+        text_position_ids = text_position_ids.to(device)
+        # Run the text encoder, tapping the activation-layer hidden states, then concatenate them into per-token
+        # text features (padding zeroed).
+        selected = self._get_text_encoder_hidden_states(
+            components.text_encoder, token_ids, attention_mask, text_position_ids
+        )
+        # Stacking puts the tapped layers on axis 0; the permute moves that axis last, so after the reshape the
+        # flattened feature axis interleaves the layers (the layer index varies fastest within each hidden unit).
+        text_features = torch.stack(selected, dim=0).permute(1, 2, 3, 0).reshape(batch_size, max_text_tokens, -1)
+        # Multiplying by the attention mask zeroes the padded positions; the result is stored as float32.
+        text_features = (text_features * attention_mask.to(text_features.dtype).unsqueeze(-1)).to(torch.float32)
+        block_state.text_features = text_features
+        block_state.text_lengths = text_lengths
+        self.set_block_state(state, block_state)
+        return components, state

diffusers_src/src/diffusers/modular_pipelines/ideogram4/modular_blocks_ideogram4.py ADDED Viewed

	@@ -0,0 +1,184 @@

+# Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ...utils import logging
+from ..modular_pipeline import SequentialPipelineBlocks
+from ..modular_pipeline_utils import InsertableDict, OutputParam
+from .before_denoise import (
+    Ideogram4PrepareAdditionalInputsStep,
+    Ideogram4PrepareLatentsStep,
+    Ideogram4SetTimestepsStep,
+    Ideogram4TextInputsStep,
+)
+from .decoders import Ideogram4DecodeStep
+from .denoise import Ideogram4AfterDenoiseStep, Ideogram4DenoiseStep
+from .encoders import Ideogram4PromptUpsampleStep, Ideogram4TextEncoderStep
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Core denoise: consumes the per-prompt text features and produces the unpatchified latents
+# (batch/latents/timesteps/ids inputs -> denoising loop -> unpatchify).
+# Maps block name -> block instance. `Ideogram4CoreDenoiseStep` takes its `block_names` from the keys and its
+# `block_classes` from the values, so the insertion order below is the order the sub-blocks are registered in
+# (presumably also their execution order — confirm against `SequentialPipelineBlocks`).
+CORE_DENOISE_BLOCKS = InsertableDict(
+    [
+        ("input", Ideogram4TextInputsStep()),
+        ("prepare_latents", Ideogram4PrepareLatentsStep()),
+        ("set_timesteps", Ideogram4SetTimestepsStep()),
+        ("prepare_additional_inputs", Ideogram4PrepareAdditionalInputsStep()),
+        ("denoise", Ideogram4DenoiseStep()),
+        ("after_denoise", Ideogram4AfterDenoiseStep()),
+    ]
+)
+# auto_docstring
+class Ideogram4CoreDenoiseStep(SequentialPipelineBlocks):
+    """
+    Core denoising workflow for Ideogram4 text-to-image: prepares the batch/latents/timesteps and the packed denoiser
+    inputs, runs the asymmetric-CFG denoising loop over the conditional and unconditional transformers, and
+    unpatchifies the result for the decoder.
+      Components:
+          transformer (`Ideogram4Transformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
+          unconditional_transformer (`Ideogram4Transformer2DModel`)
+      Inputs:
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          text_features (`Tensor`):
+              Per-prompt text features from the encoder.
+          text_lengths (`list`):
+              Per-prompt text-token counts from the encoder.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 48):
+              The number of denoising steps.
+          mu (`float`, *optional*, defaults to 0.0):
+              Base mean of the logit-normal schedule.
+          std (`float`, *optional*, defaults to 1.5):
+              Std of the logit-normal schedule.
+          guidance_schedule (`list`, *optional*, defaults to (7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
+          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
+          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 3.0, 3.0, 3.0)):
+              Per-step guidance scale schedule (length num_inference_steps).
+      Outputs:
+          latents (`Tensor`):
+              Unpatchified (B, ae_channels, H, W) latents.
+    """
+    model_name = "ideogram4"
+    # Sub-block instances and their names, both taken from CORE_DENOISE_BLOCKS so they stay positionally aligned.
+    block_classes = list(CORE_DENOISE_BLOCKS.values())
+    block_names = list(CORE_DENOISE_BLOCKS.keys())
+    @property
+    def description(self) -> str:
+        # Same wording as the summary paragraph of the class docstring above.
+        return (
+            "Core denoising workflow for Ideogram4 text-to-image: prepares the batch/latents/timesteps and the packed "
+            "denoiser inputs, runs the asymmetric-CFG denoising loop over the conditional and unconditional "
+            "transformers, and unpatchifies the result for the decoder."
+        )
+    @property
+    def outputs(self) -> list[OutputParam]:
+        # The only meaningful product of the core step is the unpatchified latents; the batch/timesteps/packed-sequence
+        # inputs prepared along the way are consumed within the loop and are not updated by it.
+        return [OutputParam.template("latents", description="Unpatchified (B, ae_channels, H, W) latents.")]
+# auto_docstring
+class Ideogram4AutoBlocks(SequentialPipelineBlocks):
+    """
+    Auto Modular pipeline for text-to-image generation using Ideogram4: (optional) prompt upsampling -> encode text ->
+    core denoise (asymmetric CFG over two transformers) -> decode.
+      Supported workflows:
+        - `text2image`: requires `prompt`
+      Components:
+          text_encoder (`Qwen3VLModel`): The Qwen3-VL text encoder. tokenizer (`Qwen2Tokenizer`): The tokenizer paired
+          with the text encoder. transformer (`Ideogram4Transformer2DModel`) scheduler
+          (`FlowMatchEulerDiscreteScheduler`) unconditional_transformer (`Ideogram4Transformer2DModel`) vae
+          (`AutoencoderKLFlux2`) image_processor (`VaeImageProcessor`)
+      Inputs:
+          prompt (`str`):
+              The prompt or prompts to guide image generation.
+          prompt_upsampling (`bool`, *optional*, defaults to False):
+              Rewrite the prompt into Ideogram4's native structured JSON caption before encoding.
+          prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
+              Sampling temperature for prompt upsampling.
+          max_sequence_length (`int`, *optional*, defaults to 2048):
+              Maximum sequence length for prompt encoding.
+          num_images_per_prompt (`int`, *optional*, defaults to 1):
+              The number of images to generate per prompt.
+          latents (`Tensor`, *optional*):
+              Pre-generated noisy latents for image generation.
+          height (`int`):
+              The height in pixels of the generated image.
+          width (`int`):
+              The width in pixels of the generated image.
+          generator (`Generator`, *optional*):
+              Torch generator for deterministic generation.
+          num_inference_steps (`int`, *optional*, defaults to 48):
+              The number of denoising steps.
+          mu (`float`, *optional*, defaults to 0.0):
+              Base mean of the logit-normal schedule.
+          std (`float`, *optional*, defaults to 1.5):
+              Std of the logit-normal schedule.
+          guidance_schedule (`list`, *optional*, defaults to (7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
+          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
+          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 3.0, 3.0, 3.0)):
+              Per-step guidance scale schedule (length num_inference_steps).
+          output_type (`str`, *optional*, defaults to pil):
+              Output format: 'pil', 'np', 'pt'.
+      Outputs:
+          images (`list`):
+              Generated images.
+    """
+    model_name = "ideogram4"
+    # Stage instances, paired positionally with `block_names` below (four entries each).
+    block_classes = [
+        Ideogram4PromptUpsampleStep(),
+        Ideogram4TextEncoderStep(),
+        Ideogram4CoreDenoiseStep(),
+        Ideogram4DecodeStep(),
+    ]
+    block_names = ["prompt_upsample", "text_encoder", "denoise", "decode"]
+    # Workflow map declaring the trigger conditions for each supported workflow.
+    # `True` means the workflow triggers when the input is not None.
+    _workflow_map = {
+        "text2image": {"prompt": True},
+    }
+    @property
+    def description(self) -> str:
+        # Names the four stages in the same order as `block_names`.
+        return (
+            "Auto Modular pipeline for text-to-image generation using Ideogram4: (optional) prompt upsampling -> "
+            "encode text -> core denoise (asymmetric CFG over two transformers) -> decode."
+        )
+    @property
+    def outputs(self) -> list[OutputParam]:
+        # Only the final `images` output is advertised for the assembled pipeline.
+        return [OutputParam.template("images")]

diffusers_src/src/diffusers/modular_pipelines/ideogram4/modular_pipeline.py ADDED Viewed

	@@ -0,0 +1,46 @@

+# Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from ..modular_pipeline import ModularPipeline
+class Ideogram4ModularPipeline(ModularPipeline):
+    """
+    A ModularPipeline for Ideogram4.
+    > [!WARNING] > This is an experimental feature!
+    """
+    default_blocks_name = "Ideogram4AutoBlocks"
+    # Ideogram4 patchifies the VAE output by a factor of 2 before feeding the transformer.
+    @property
+    def patch_size(self):
+        return 2
+    @property
+    def default_height(self):
+        return 2048
+    @property
+    def default_width(self):
+        return 2048
+    @property
+    def vae_scale_factor(self):
+        vae_scale_factor = 8
+        if getattr(self, "vae", None) is not None:
+            vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        return vae_scale_factor

diffusers_src/src/diffusers/modular_pipelines/modular_pipeline.py CHANGED Viewed

@@ -126,6 +126,7 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
         ("flux-kontext", _create_default_map_fn("FluxKontextModularPipeline")),
         ("flux2", _create_default_map_fn("Flux2ModularPipeline")),
         ("flux2-klein", _flux2_klein_map_fn),
         ("qwenimage", _create_default_map_fn("QwenImageModularPipeline")),
         ("qwenimage-edit", _create_default_map_fn("QwenImageEditModularPipeline")),
         ("qwenimage-edit-plus", _create_default_map_fn("QwenImageEditPlusModularPipeline")),

         ("flux-kontext", _create_default_map_fn("FluxKontextModularPipeline")),
         ("flux2", _create_default_map_fn("Flux2ModularPipeline")),
         ("flux2-klein", _flux2_klein_map_fn),
+        ("ideogram4", _create_default_map_fn("Ideogram4ModularPipeline")),
         ("qwenimage", _create_default_map_fn("QwenImageModularPipeline")),
         ("qwenimage-edit", _create_default_map_fn("QwenImageEditModularPipeline")),
         ("qwenimage-edit-plus", _create_default_map_fn("QwenImageEditPlusModularPipeline")),

diffusers_src/src/diffusers/pipelines/auto_pipeline.py CHANGED Viewed

@@ -59,6 +59,7 @@ from .flux2 import Flux2KleinPipeline, Flux2Pipeline
 from .glm_image import GlmImagePipeline
 from .helios import HeliosPipeline, HeliosPyramidPipeline
 from .hunyuandit import HunyuanDiTPipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
     KandinskyImg2ImgCombinedPipeline,
@@ -175,6 +176,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
         ("flux-kontext", FluxKontextPipeline),
         ("flux2-klein", Flux2KleinPipeline),
         ("flux2", Flux2Pipeline),
         ("lumina", LuminaPipeline),
         ("lumina2", Lumina2Pipeline),
         ("chroma", ChromaPipeline),

 from .glm_image import GlmImagePipeline
 from .helios import HeliosPipeline, HeliosPyramidPipeline
 from .hunyuandit import HunyuanDiTPipeline
+from .ideogram4 import Ideogram4Pipeline
 from .kandinsky import (
     KandinskyCombinedPipeline,
     KandinskyImg2ImgCombinedPipeline,
         ("flux-kontext", FluxKontextPipeline),
         ("flux2-klein", Flux2KleinPipeline),
         ("flux2", Flux2Pipeline),
+        ("ideogram4", Ideogram4Pipeline),
         ("lumina", LuminaPipeline),
         ("lumina2", Lumina2Pipeline),
         ("chroma", ChromaPipeline),

diffusers_src/src/diffusers/pipelines/ideogram4/pipeline_ideogram4.py CHANGED Viewed

@@ -29,11 +29,16 @@ from ...models.transformers.transformer_ideogram4 import (
     Ideogram4Transformer2DModel,
 )
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import is_outlines_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import Ideogram4PipelineOutput
-from .prompt_enhancer import CAPTION_SYSTEM_MESSAGE, CAPTION_USER_TEMPLATE, build_caption_logits_processor
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -43,10 +48,6 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 # text conditioning consumed by the Ideogram4 transformer.
 QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
-# LM head grafted onto the (head-less) text encoder for optional prompt upsampling.
-DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
-PROMPT_UPSAMPLE_TEMPERATURE = 1.0
 EXAMPLE_DOC_STRING = """
     Examples:
@@ -161,7 +162,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
     """
     model_cpu_offload_seq = "text_encoder->transformer->unconditional_transformer->vae"
-    _optional_components = []
     _callback_tensor_inputs = ["latents"]
     def __init__(
@@ -172,6 +173,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
         tokenizer: AutoTokenizer,
         transformer: Ideogram4Transformer2DModel,
         unconditional_transformer: Ideogram4Transformer2DModel,
     ) -> None:
         super().__init__()
@@ -182,6 +184,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
             tokenizer=tokenizer,
             transformer=transformer,
             unconditional_transformer=unconditional_transformer,
         )
         self.vae_scale_factor = (
@@ -191,8 +194,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
         self.patch_size = 2
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * self.patch_size)
-        # Lazily built by `load_prompt_enhancer` for optional prompt upsampling.
-        self._caption_model = None
         self._caption_logits_processor = None
     def load_prompt_enhancer(
@@ -207,82 +209,45 @@ class Ideogram4Pipeline(DiffusionPipeline):
         Called automatically by `upsample_prompt` on first use. Generation is constrained to the caption JSON
         schema when `outlines` is installed; otherwise it falls back to unconstrained decoding with a warning.
         """
-        from accelerate import init_empty_weights
-        from huggingface_hub import hf_hub_download
-        from safetensors.torch import load_file
-        from transformers import Qwen3VLForConditionalGeneration
-        dtype = torch_dtype or self.text_encoder.dtype
-        head_weight = load_file(hf_hub_download(lm_head_repo_id, lm_head_filename))["lm_head.weight"].to(dtype)
-        with init_empty_weights():
-            caption_model = Qwen3VLForConditionalGeneration(self.text_encoder.config)
-        caption_model.model = self.text_encoder  # reuse the loaded encoder body
-        lm_head = torch.nn.Linear(head_weight.shape[1], head_weight.shape[0], bias=False)
-        with torch.no_grad():
-            lm_head.weight.copy_(head_weight)
-        caption_model.lm_head = lm_head.to(device=self.text_encoder.device, dtype=dtype)
-        caption_model.eval()
-        if is_outlines_available():
-            logits_processor = build_caption_logits_processor(caption_model, self.tokenizer)
-        else:
-            logits_processor = None
-            logger.warning(
-                "`outlines` is not installed; prompt upsampling will run unconstrained and may not return "
-                "schema-valid JSON. Install with `pip install outlines` for structured captions."
-            )
-        self._caption_model = caption_model
-        self._caption_logits_processor = logits_processor
-        return caption_model
     def upsample_prompt(
         self,
         prompt: str | list[str],
         height: int = 2048,
         width: int = 2048,
         max_new_tokens: int = 1024,
         lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
         device: torch.device | None = None,
     ) -> list[str]:
-        """Rewrite each prompt into Ideogram4's native structured JSON caption via the grafted text encoder."""
-        if self._caption_model is None:
             self.load_prompt_enhancer(lm_head_repo_id=lm_head_repo_id)
-        device = device or self._caption_model.device
-        prompts = [prompt] if isinstance(prompt, str) else list(prompt)
-        divisor = math.gcd(width, height) or 1
-        aspect_ratio = f"{width // divisor}:{height // divisor}"
-        captions = []
-        for text_prompt in prompts:
-            messages = [
-                {"role": "system", "content": CAPTION_SYSTEM_MESSAGE},
-                {
-                    "role": "user",
-                    "content": CAPTION_USER_TEMPLATE.format(aspect_ratio=aspect_ratio, original_prompt=text_prompt),
-                },
-            ]
-            inputs = self.tokenizer.apply_chat_template(
-                messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
-            ).to(device)
-            generate_kwargs = {
-                "max_new_tokens": max_new_tokens,
-                "do_sample": True,
-                "temperature": PROMPT_UPSAMPLE_TEMPERATURE,
-                "use_cache": True,
-            }
-            if self._caption_logits_processor is not None:
-                self._caption_logits_processor.reset()
-                generate_kwargs["logits_processor"] = [self._caption_logits_processor]
-            generated = self._caption_model.generate(**inputs, **generate_kwargs)
-            new_tokens = generated[:, inputs["input_ids"].shape[1] :]
-            captions.append(self.tokenizer.decode(new_tokens[0], skip_special_tokens=True).strip())
-        return captions
     def _prepare_ids(
-        self,
         text_lengths: list[int],
         grid_h: int,
         grid_w: int,
@@ -323,15 +288,16 @@ class Ideogram4Pipeline(DiffusionPipeline):
         return position_ids.to(device), segment_ids.to(device), indicator.to(device)
     def _get_text_encoder_hidden_states(
-        self,
         token_ids: torch.Tensor,
         attention_mask: torch.Tensor,
         pos_2d: torch.Tensor,
     ) -> list[torch.Tensor]:
         """Run the text encoder's decoder layers, returning the hidden states tapped at each activation layer."""
-        language_model = self.text_encoder.language_model
         inputs_embeds = language_model.embed_tokens(token_ids)
@@ -405,7 +371,9 @@ class Ideogram4Pipeline(DiffusionPipeline):
         text_position_ids = text_position_ids.to(device)
         # Concatenate the tapped activation-layer hidden states into per-token text features, zeroing padding.
-        selected = self._get_text_encoder_hidden_states(token_ids, attention_mask, text_position_ids)
         text_features = torch.stack(selected, dim=0).permute(1, 2, 3, 0).reshape(batch_size, max_sequence_length, -1)
         text_features = (text_features * attention_mask.to(text_features.dtype).unsqueeze(-1)).to(torch.float32)
@@ -509,6 +477,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
         mu: float = 0.0,
         std: float = 1.5,
         prompt_upsampling: bool = False,
         max_sequence_length: int = 2048,
         num_images_per_prompt: int = 1,
         generator: torch.Generator | list[torch.Generator] | None = None,
@@ -547,7 +516,10 @@ class Ideogram4Pipeline(DiffusionPipeline):
             prompt_upsampling (`bool`, *optional*, defaults to `False`):
                 If `True`, rewrite `prompt` into Ideogram4's native structured JSON caption via
                 [`~Ideogram4Pipeline.upsample_prompt`] before encoding. Requires the prompt-enhancer LM head
-                (downloaded on first use); install `outlines` for schema-constrained captions.
             max_sequence_length (`int`, *optional*, defaults to 2048):
                 Maximum number of text tokens per prompt.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -591,7 +563,14 @@ class Ideogram4Pipeline(DiffusionPipeline):
         # 0. Optionally rewrite the prompt(s) into Ideogram4's native structured JSON caption.
         if prompt_upsampling:
-            prompt = self.upsample_prompt(prompt, height=height, width=width, device=device)
         # 1. Image grid (drives both the packed layout and the latent shape).
         grid_h, grid_w = (

     Ideogram4Transformer2DModel,
 )
 from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import Ideogram4PipelineOutput
+from .prompt_enhancer import (
+    DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
+    PROMPT_UPSAMPLE_TEMPERATURE,
+    generate_captions,
+    graft_lm_head,
+)
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 # text conditioning consumed by the Ideogram4 transformer.
 QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
 EXAMPLE_DOC_STRING = """
     Examples:
     """
     model_cpu_offload_seq = "text_encoder->transformer->unconditional_transformer->vae"
+    _optional_components = ["prompt_enhancer"]
     _callback_tensor_inputs = ["latents"]
     def __init__(
         tokenizer: AutoTokenizer,
         transformer: Ideogram4Transformer2DModel,
         unconditional_transformer: Ideogram4Transformer2DModel,
+        prompt_enhancer: PreTrainedModel | None = None,
     ) -> None:
         super().__init__()
             tokenizer=tokenizer,
             transformer=transformer,
             unconditional_transformer=unconditional_transformer,
+            prompt_enhancer=prompt_enhancer,
         )
         self.vae_scale_factor = (
         self.patch_size = 2
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * self.patch_size)
+        # Outlines logits processor derived from `prompt_enhancer`; rebuilt by `load_prompt_enhancer`.
         self._caption_logits_processor = None
     def load_prompt_enhancer(
         Called automatically by `upsample_prompt` on first use. Generation is constrained to the caption JSON
         schema when `outlines` is installed; otherwise it falls back to unconstrained decoding with a warning.
         """
+        prompt_enhancer, self._caption_logits_processor = graft_lm_head(
+            self.text_encoder, self.tokenizer, lm_head_repo_id, lm_head_filename, torch_dtype
+        )
+        self.register_modules(prompt_enhancer=prompt_enhancer)
+        return prompt_enhancer
     def upsample_prompt(
         self,
         prompt: str | list[str],
         height: int = 2048,
         width: int = 2048,
+        temperature: float = PROMPT_UPSAMPLE_TEMPERATURE,
         max_new_tokens: int = 1024,
+        generator: torch.Generator | list[torch.Generator] | None = None,
         lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
         device: torch.device | None = None,
     ) -> list[str]:
+        """Rewrite each prompt into Ideogram4's native structured JSON caption via the grafted text encoder.
+        Pass `generator` (the same one accepted by `__call__`) to make sampling reproducible.
+        """
+        if self.prompt_enhancer is None:
             self.load_prompt_enhancer(lm_head_repo_id=lm_head_repo_id)
+        return generate_captions(
+            self.prompt_enhancer,
+            self.tokenizer,
+            self._caption_logits_processor,
+            prompt,
+            height,
+            width,
+            temperature=temperature,
+            max_new_tokens=max_new_tokens,
+            generator=generator,
+            device=device,
+        )
+    @staticmethod
     def _prepare_ids(
         text_lengths: list[int],
         grid_h: int,
         grid_w: int,
         return position_ids.to(device), segment_ids.to(device), indicator.to(device)
+    @staticmethod
     def _get_text_encoder_hidden_states(
+        text_encoder,
         token_ids: torch.Tensor,
         attention_mask: torch.Tensor,
         pos_2d: torch.Tensor,
     ) -> list[torch.Tensor]:
         """Run the text encoder's decoder layers, returning the hidden states tapped at each activation layer."""
+        language_model = text_encoder.language_model
         inputs_embeds = language_model.embed_tokens(token_ids)
         text_position_ids = text_position_ids.to(device)
         # Concatenate the tapped activation-layer hidden states into per-token text features, zeroing padding.
+        selected = self._get_text_encoder_hidden_states(
+            self.text_encoder, token_ids, attention_mask, text_position_ids
+        )
         text_features = torch.stack(selected, dim=0).permute(1, 2, 3, 0).reshape(batch_size, max_sequence_length, -1)
         text_features = (text_features * attention_mask.to(text_features.dtype).unsqueeze(-1)).to(torch.float32)
         mu: float = 0.0,
         std: float = 1.5,
         prompt_upsampling: bool = False,
+        prompt_upsampling_temperature: float = PROMPT_UPSAMPLE_TEMPERATURE,
         max_sequence_length: int = 2048,
         num_images_per_prompt: int = 1,
         generator: torch.Generator | list[torch.Generator] | None = None,
             prompt_upsampling (`bool`, *optional*, defaults to `False`):
                 If `True`, rewrite `prompt` into Ideogram4's native structured JSON caption via
                 [`~Ideogram4Pipeline.upsample_prompt`] before encoding. Requires the prompt-enhancer LM head
+                (downloaded on first use); install `outlines` for schema-constrained captions. `generator` is
+                reused to make the upsampling reproducible.
+            prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
+                Sampling temperature for prompt upsampling when `prompt_upsampling=True`.
             max_sequence_length (`int`, *optional*, defaults to 2048):
                 Maximum number of text tokens per prompt.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
         # 0. Optionally rewrite the prompt(s) into Ideogram4's native structured JSON caption.
         if prompt_upsampling:
+            prompt = self.upsample_prompt(
+                prompt,
+                height=height,
+                width=width,
+                temperature=prompt_upsampling_temperature,
+                generator=generator,
+                device=device,
+            )
         # 1. Image grid (drives both the packed layout and the latent shape).
         grid_h, grid_w = (

diffusers_src/src/diffusers/pipelines/ideogram4/prompt_enhancer.py CHANGED Viewed

@@ -20,8 +20,24 @@ Qwen3-VL text encoder grafted with a generative head (see `Ideogram4Pipeline.loa
 This mirrors the role of Flux2's `system_messages.py`, but the target is a constrained JSON object instead of
 free text, so `outlines` (an optional dependency) is used to guarantee a schema-valid result when available.
 """
 # System message that instructs the encoder to emit Ideogram4's native single-line JSON caption.
 CAPTION_SYSTEM_MESSAGE = """You convert a short user idea into a structured JSON caption for an image renderer. Output ONE minified single-line JSON object and NOTHING else (no markdown, no commentary).
@@ -107,3 +123,102 @@ def build_caption_logits_processor(model, tokenizer):
     outlines_model = outlines.from_transformers(model, tokenizer)
     return outlines.Generator(outlines_model, Caption).logits_processor

 This mirrors the role of Flux2's `system_messages.py`, but the target is a constrained JSON object instead of
 free text, so `outlines` (an optional dependency) is used to guarantee a schema-valid result when available.
+The graft/generate helpers here are shared by `Ideogram4Pipeline` and the modular `Ideogram4PromptUpsampleStep`.
 """
+import math
+import torch
+from ...utils import is_outlines_available, logging
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+# Qwen3-VL LM head grafted onto the (head-less) text encoder for prompt upsampling.
+DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO = "diffusers/qwen3-vl-8b-instruct-lm-head"
+PROMPT_UPSAMPLE_TEMPERATURE = 1.0
 # System message that instructs the encoder to emit Ideogram4's native single-line JSON caption.
 CAPTION_SYSTEM_MESSAGE = """You convert a short user idea into a structured JSON caption for an image renderer. Output ONE minified single-line JSON object and NOTHING else (no markdown, no commentary).
     outlines_model = outlines.from_transformers(model, tokenizer)
     return outlines.Generator(outlines_model, Caption).logits_processor
+def graft_lm_head(
+    text_encoder,
+    tokenizer,
+    lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
+    lm_head_filename: str = "lm_head.safetensors",
+    torch_dtype: torch.dtype | None = None,
+):
+    """Graft a hosted LM head onto the (head-less) Qwen3-VL `text_encoder` to make it generative.
+    Returns `(prompt_enhancer, logits_processor)`. The encoder body is shared (only the head is loaded). The
+    logits processor constrains generation to the caption JSON schema when `outlines` is installed; otherwise it
+    is `None` and generation runs unconstrained (a warning is logged).
+    """
+    from accelerate import init_empty_weights
+    from huggingface_hub import hf_hub_download
+    from safetensors.torch import load_file
+    from transformers import Qwen3VLForConditionalGeneration
+    dtype = torch_dtype or text_encoder.dtype
+    head_weight = load_file(hf_hub_download(lm_head_repo_id, lm_head_filename))["lm_head.weight"].to(dtype)
+    with init_empty_weights():
+        prompt_enhancer = Qwen3VLForConditionalGeneration(text_encoder.config)
+    prompt_enhancer.model = text_encoder  # reuse the loaded encoder body
+    lm_head = torch.nn.Linear(head_weight.shape[1], head_weight.shape[0], bias=False)
+    with torch.no_grad():
+        lm_head.weight.copy_(head_weight)
+    prompt_enhancer.lm_head = lm_head.to(device=text_encoder.device, dtype=dtype)
+    prompt_enhancer.eval()
+    if is_outlines_available():
+        logits_processor = build_caption_logits_processor(prompt_enhancer, tokenizer)
+    else:
+        logits_processor = None
+        logger.warning(
+            "`outlines` is not installed; prompt upsampling will run unconstrained and may not return "
+            "schema-valid JSON. Install with `pip install outlines` for structured captions."
+        )
+    return prompt_enhancer, logits_processor
+def generate_captions(
+    prompt_enhancer,
+    tokenizer,
+    logits_processor,
+    prompt: str | list[str],
+    height: int,
+    width: int,
+    temperature: float = PROMPT_UPSAMPLE_TEMPERATURE,
+    max_new_tokens: int = 1024,
+    generator: torch.Generator | list[torch.Generator] | None = None,
+    device: torch.device | None = None,
+) -> list[str]:
+    """Rewrite each prompt into the native structured JSON caption with the grafted `prompt_enhancer`.
+    Pass `generator` to make sampling reproducible (a seed is derived from it and used inside a forked RNG so the
+    caller's own RNG stream is untouched).
+    """
+    device = device or prompt_enhancer.device
+    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
+    divisor = math.gcd(width, height) or 1
+    aspect_ratio = f"{width // divisor}:{height // divisor}"
+    sampling_seed = None
+    if generator is not None:
+        gen = generator[0] if isinstance(generator, list) else generator
+        sampling_seed = int(torch.randint(0, 2**63 - 1, (1,), generator=gen, device=gen.device).item())
+    fork_devices = [device] if getattr(device, "type", None) == "cuda" else []
+    captions = []
+    for i, text_prompt in enumerate(prompts):
+        messages = [
+            {"role": "system", "content": CAPTION_SYSTEM_MESSAGE},
+            {
+                "role": "user",
+                "content": CAPTION_USER_TEMPLATE.format(aspect_ratio=aspect_ratio, original_prompt=text_prompt),
+            },
+        ]
+        inputs = tokenizer.apply_chat_template(
+            messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
+        ).to(device)
+        generate_kwargs = {
+            "max_new_tokens": max_new_tokens,
+            "do_sample": temperature > 0,
+            "temperature": temperature,
+            "use_cache": True,
+        }
+        if logits_processor is not None:
+            logits_processor.reset()
+            generate_kwargs["logits_processor"] = [logits_processor]
+        with torch.random.fork_rng(devices=fork_devices, enabled=sampling_seed is not None):
+            if sampling_seed is not None:
+                torch.manual_seed(sampling_seed + i)
+            generated = prompt_enhancer.generate(**inputs, **generate_kwargs)
+        new_tokens = generated[:, inputs["input_ids"].shape[1] :]
+        captions.append(tokenizer.decode(new_tokens[0], skip_special_tokens=True).strip())
+    return captions

diffusers_src/src/diffusers/utils/dummy_torch_and_transformers_objects.py CHANGED Viewed

@@ -332,6 +332,36 @@ class HunyuanVideo15ModularPipeline(metaclass=DummyObject):
         requires_backends(cls, ["torch", "transformers"])
 class LTXAutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]

         requires_backends(cls, ["torch", "transformers"])
+# Placeholder exported under the real class name. Every entry point only calls `requires_backends` for
+# torch + transformers (presumably raising an informative error when they are missing; confirm there).
+class Ideogram4AutoBlocks(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+# Placeholder exported under the real class name. Every entry point only calls `requires_backends` for
+# torch + transformers (presumably raising an informative error when they are missing; confirm there).
+class Ideogram4ModularPipeline(metaclass=DummyObject):
+    _backends = ["torch", "transformers"]
+    def __init__(self, *args, **kwargs):
+        requires_backends(self, ["torch", "transformers"])
+    @classmethod
+    def from_config(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
+    @classmethod
+    def from_pretrained(cls, *args, **kwargs):
+        requires_backends(cls, ["torch", "transformers"])
 class LTXAutoBlocks(metaclass=DummyObject):
     _backends = ["torch", "transformers"]