Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

multimodalart HF Staff committed on Jun 3

Commit

168566a

verified ·

1 Parent(s): e99a766

Bundle PR diffusers (yiyi-refactor-fused + native prompt upsampling)

Browse files

Files changed (6) hide show

diffusers_src/src/diffusers/models/transformers/transformer_ideogram4.py +85 -57
diffusers_src/src/diffusers/pipelines/ideogram4/pipeline_ideogram4.py +329 -184
diffusers_src/src/diffusers/pipelines/ideogram4/prompt_enhancer.py +109 -0
diffusers_src/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py +1 -3
diffusers_src/src/diffusers/utils/__init__.py +1 -0
diffusers_src/src/diffusers/utils/import_utils.py +5 -0

diffusers_src/src/diffusers/models/transformers/transformer_ideogram4.py CHANGED Viewed

@@ -12,6 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import math
 import torch
@@ -22,6 +23,8 @@ from ...configuration_utils import ConfigMixin, register_to_config
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import logging
 from ...utils.torch_utils import maybe_allow_in_graph
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
 from ..normalization import RMSNorm
@@ -44,19 +47,6 @@ def _rotate_half(x: torch.Tensor) -> torch.Tensor:
     return torch.cat((-x[..., half:], x[..., :half]), dim=-1)
-def _apply_rotary_pos_emb(
-    q: torch.Tensor,
-    k: torch.Tensor,
-    cos: torch.Tensor,
-    sin: torch.Tensor,
-) -> tuple[torch.Tensor, torch.Tensor]:
-    cos = cos.unsqueeze(1)
-    sin = sin.unsqueeze(1)
-    q_embed = (q * cos) + (_rotate_half(q) * sin)
-    k_embed = (k * cos) + (_rotate_half(k) * sin)
-    return q_embed, k_embed
 class Ideogram4MRoPE(nn.Module):
     """Multi-axis (t, h, w) interleaved rotary position embedding."""
@@ -74,7 +64,6 @@ class Ideogram4MRoPE(nn.Module):
         self.mrope_section = tuple(mrope_section)
         self.head_dim = head_dim
-    @torch.no_grad()
     def forward(self, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         # position_ids: (B, L, 3) of int (axes are t, h, w).
         if position_ids.ndim != 3 or position_ids.shape[-1] != 3:
@@ -97,8 +86,49 @@ class Ideogram4MRoPE(nn.Module):
         return emb.cos(), emb.sin()
-class Ideogram4Attention(nn.Module):
-    """Self-attention with merged QKV, q/k RMSNorm, MRoPE and block-diagonal mask."""
     def __init__(self, hidden_size: int, num_heads: int, eps: float = 1e-5) -> None:
         super().__init__()
@@ -113,34 +143,23 @@ class Ideogram4Attention(nn.Module):
         self.norm_k = RMSNorm(self.head_dim, eps=eps, elementwise_affine=True)
         self.o = nn.Linear(hidden_size, hidden_size, bias=False)
     def forward(
         self,
         hidden_states: torch.Tensor,
-        segment_ids: torch.Tensor,
-        cos: torch.Tensor,
-        sin: torch.Tensor,
     ) -> torch.Tensor:
-        batch_size, seq_len, _ = hidden_states.shape
-        qkv = self.qkv(hidden_states).view(batch_size, seq_len, 3, self.num_heads, self.head_dim)
-        q, k, v = qkv.unbind(dim=2)
-        q = self.norm_q(q)
-        k = self.norm_k(k)
-        # SDPA expects (B, num_heads, L, head_dim).
-        q = q.transpose(1, 2)
-        k = k.transpose(1, 2)
-        v = v.transpose(1, 2)
-        q, k = _apply_rotary_pos_emb(q, k, cos, sin)
-        # Block-diagonal mask from segment ids: tokens only attend within their segment.
-        attn_mask = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).unsqueeze(1)
-        out = F.scaled_dot_product_attention(q, k, v, attn_mask=attn_mask)
-        out = out.transpose(1, 2).reshape(batch_size, seq_len, self.hidden_size)
-        return self.o(out)
 class Ideogram4MLP(nn.Module):
@@ -180,9 +199,8 @@ class Ideogram4TransformerBlock(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
-        segment_ids: torch.Tensor,
-        cos: torch.Tensor,
-        sin: torch.Tensor,
         adaln_input: torch.Tensor,
     ) -> torch.Tensor:
         mod = self.adaln_modulation(adaln_input)
@@ -194,9 +212,8 @@ class Ideogram4TransformerBlock(nn.Module):
         attn_out = self.attention(
             self.attention_norm1(hidden_states) * scale_msa,
-            segment_ids=segment_ids,
-            cos=cos,
-            sin=sin,
         )
         hidden_states = hidden_states + gate_msa * self.attention_norm2(attn_out)
         hidden_states = hidden_states + gate_mlp * self.ffn_norm2(
@@ -251,7 +268,7 @@ class Ideogram4FinalLayer(nn.Module):
         return self.linear(self.norm_final(hidden_states) * scale)
-class Ideogram4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     The flow-matching transformer backbone used by the Ideogram 4 pipeline.
@@ -346,6 +363,19 @@ class Ideogram4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
             adaln_dim=adaln_dim,
         )
     def forward(
         self,
         hidden_states: torch.Tensor,
@@ -377,19 +407,13 @@ class Ideogram4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
         Returns:
             [`~models.modeling_outputs.Transformer2DModelOutput`] or a `tuple` whose first element is a tensor of shape
-            `(batch_size, sequence_length, in_channels)` in float32. Only positions tagged with
             `OUTPUT_IMAGE_INDICATOR` carry meaningful velocity predictions.
         """
         batch_size, seq_len, in_channels = hidden_states.shape
         if in_channels != self.in_channels:
             raise ValueError(f"Expected last dim {self.in_channels}, got {in_channels}.")
-        param_dtype = self.dtype
-        hidden_states = hidden_states.to(param_dtype)
-        timestep = timestep.to(param_dtype)
-        encoder_hidden_states = encoder_hidden_states.to(param_dtype)
-        indicator = indicator.to(torch.long)
         llm_token_mask = (indicator == LLM_TOKEN_INDICATOR).to(hidden_states.dtype).unsqueeze(-1)
         output_image_mask = (indicator == OUTPUT_IMAGE_INDICATOR).to(hidden_states.dtype).unsqueeze(-1)
@@ -414,16 +438,20 @@ class Ideogram4Transformer2DModel(ModelMixin, ConfigMixin, PeftAdapterMixin, Fro
         cos, sin = self.rotary_emb(position_ids)
         cos = cos.to(hidden_states.dtype)
         sin = sin.to(hidden_states.dtype)
         for block in self.layers:
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 hidden_states = self._gradient_checkpointing_func(
-                    block, hidden_states, segment_ids, cos, sin, adaln_input
                 )
             else:
-                hidden_states = block(hidden_states, segment_ids, cos, sin, adaln_input)
-        output = self.final_layer(hidden_states, conditioning=adaln_input).to(torch.float32)
         if not return_dict:
             return (output,)

 # See the License for the specific language governing permissions and
 # limitations under the License.
+import inspect
 import math
 import torch
 from ...loaders import FromOriginalModelMixin, PeftAdapterMixin
 from ...utils import logging
 from ...utils.torch_utils import maybe_allow_in_graph
+from ..attention import AttentionMixin, AttentionModuleMixin
+from ..attention_dispatch import dispatch_attention_fn
 from ..modeling_outputs import Transformer2DModelOutput
 from ..modeling_utils import ModelMixin
 from ..normalization import RMSNorm
     return torch.cat((-x[..., half:], x[..., :half]), dim=-1)
 class Ideogram4MRoPE(nn.Module):
     """Multi-axis (t, h, w) interleaved rotary position embedding."""
         self.mrope_section = tuple(mrope_section)
         self.head_dim = head_dim
     def forward(self, position_ids: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
         # position_ids: (B, L, 3) of int (axes are t, h, w).
         if position_ids.ndim != 3 or position_ids.shape[-1] != 3:
         return emb.cos(), emb.sin()
+class Ideogram4AttnProcessor:  # attention processor: fused QKV -> q/k norm -> rotary embedding -> dispatched attention -> output projection
+    _attention_backend = None  # forwarded as `backend=` to dispatch_attention_fn
+    _parallel_config = None  # forwarded as `parallel_config=` to dispatch_attention_fn
+    def __call__(
+        self,
+        attn: "Ideogram4Attention",  # module supplying qkv, norm_q, norm_k, o, num_heads and head_dim
+        hidden_states: torch.Tensor,  # unpacked below as (batch_size, seq_len, _)
+        attention_mask: torch.Tensor,  # passed through unchanged as `attn_mask`; NOTE(review): Ideogram4Attention.forward defaults this to None
+        image_rotary_emb: tuple[torch.Tensor, torch.Tensor],  # (cos, sin) pair; NOTE(review): the unpacking below fails if the caller leaves this as None
+    ) -> torch.Tensor:
+        batch_size, seq_len, _ = hidden_states.shape
+        qkv = attn.qkv(hidden_states).view(batch_size, seq_len, 3, attn.num_heads, attn.head_dim)
+        query, key, value = qkv.unbind(dim=2)  # split the size-3 axis of the fused projection into q, k, v
+        query = attn.norm_q(query)  # q/k are normalised before the rotary embedding is applied
+        key = attn.norm_k(key)
+        # MRoPE applied in (B, L, num_heads, head_dim) layout; cos/sin broadcast over the head axis.
+        cos, sin = image_rotary_emb
+        cos = cos.unsqueeze(2)  # insert a singleton head axis so cos broadcasts across heads
+        sin = sin.unsqueeze(2)  # same for sin
+        query = (query * cos) + (_rotate_half(query) * sin)
+        key = (key * cos) + (_rotate_half(key) * sin)  # value is intentionally left un-rotated
+        hidden_states = dispatch_attention_fn(
+            query,
+            key,
+            value,
+            attn_mask=attention_mask,
+            backend=self._attention_backend,
+            parallel_config=self._parallel_config,
+        )
+        hidden_states = hidden_states.flatten(2, 3)  # merge dims 2 and 3 (presumably heads and head_dim) before the output projection
+        return attn.o(hidden_states)
+class Ideogram4Attention(nn.Module, AttentionModuleMixin):
+    """Self-attention with merged QKV, q/k RMSNorm, MRoPE and a block-diagonal segment mask."""
+    _default_processor_cls = Ideogram4AttnProcessor
+    _available_processors = [Ideogram4AttnProcessor]
     def __init__(self, hidden_size: int, num_heads: int, eps: float = 1e-5) -> None:
         super().__init__()
         self.norm_k = RMSNorm(self.head_dim, eps=eps, elementwise_affine=True)
         self.o = nn.Linear(hidden_size, hidden_size, bias=False)
+        self.set_processor(self._default_processor_cls())
     def forward(  # filters extra kwargs against the processor's signature, then delegates to the processor
         self,
         hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor | None = None,
+        image_rotary_emb: tuple[torch.Tensor, torch.Tensor] | None = None,
+        **kwargs,
     ) -> torch.Tensor:
+        attn_parameters = set(inspect.signature(self.processor.__call__).parameters.keys())  # parameter names the active processor accepts
+        unused_kwargs = [k for k in kwargs if k not in attn_parameters]  # kwargs the processor cannot take
+        if len(unused_kwargs) > 0:
+            logger.warning(  # unsupported kwargs are reported, not treated as an error
+                f"attention_kwargs {unused_kwargs} are not expected by {self.processor.__class__.__name__} and will be ignored."
+            )
+        kwargs = {k: w for k, w in kwargs.items() if k in attn_parameters}  # keep only what the processor accepts
+        return self.processor(self, hidden_states, attention_mask, image_rotary_emb, **kwargs)  # mask and rotary pair are passed positionally
 class Ideogram4MLP(nn.Module):
     def forward(
         self,
         hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        image_rotary_emb: tuple[torch.Tensor, torch.Tensor],
         adaln_input: torch.Tensor,
     ) -> torch.Tensor:
         mod = self.adaln_modulation(adaln_input)
         attn_out = self.attention(
             self.attention_norm1(hidden_states) * scale_msa,
+            attention_mask=attention_mask,
+            image_rotary_emb=image_rotary_emb,
         )
         hidden_states = hidden_states + gate_msa * self.attention_norm2(attn_out)
         hidden_states = hidden_states + gate_mlp * self.ffn_norm2(
         return self.linear(self.norm_final(hidden_states) * scale)
+class Ideogram4Transformer2DModel(ModelMixin, ConfigMixin, AttentionMixin, PeftAdapterMixin, FromOriginalModelMixin):
     r"""
     The flow-matching transformer backbone used by the Ideogram 4 pipeline.
             adaln_dim=adaln_dim,
         )
+    def fuse_qkv_projections(self):  # unconditionally raises NotImplementedError
+        # The attention already uses a single fused `qkv` projection, so there is nothing to fuse.
+        raise NotImplementedError(
+            "Ideogram4Transformer2DModel already uses a fused QKV projection (`attention.qkv`), "
+            "so `fuse_qkv_projections()` is not applicable."
+        )
+    def unfuse_qkv_projections(self):  # unconditionally raises NotImplementedError; counterpart of fuse_qkv_projections
+        raise NotImplementedError(
+            "Ideogram4Transformer2DModel uses a fused QKV projection that cannot be split, "
+            "so `unfuse_qkv_projections()` is not applicable."
+        )
     def forward(
         self,
         hidden_states: torch.Tensor,
         Returns:
             [`~models.modeling_outputs.Transformer2DModelOutput`] or a `tuple` whose first element is a tensor of shape
+            `(batch_size, sequence_length, in_channels)` in the model's compute dtype. Only positions tagged with
             `OUTPUT_IMAGE_INDICATOR` carry meaningful velocity predictions.
         """
         batch_size, seq_len, in_channels = hidden_states.shape
         if in_channels != self.in_channels:
             raise ValueError(f"Expected last dim {self.in_channels}, got {in_channels}.")
         llm_token_mask = (indicator == LLM_TOKEN_INDICATOR).to(hidden_states.dtype).unsqueeze(-1)
         output_image_mask = (indicator == OUTPUT_IMAGE_INDICATOR).to(hidden_states.dtype).unsqueeze(-1)
         cos, sin = self.rotary_emb(position_ids)
         cos = cos.to(hidden_states.dtype)
         sin = sin.to(hidden_states.dtype)
+        image_rotary_emb = (cos, sin)
+        # Block-diagonal mask from segment ids: tokens only attend within their segment. Shared by every block.
+        attention_mask = (segment_ids.unsqueeze(2) == segment_ids.unsqueeze(1)).unsqueeze(1)
         for block in self.layers:
             if torch.is_grad_enabled() and self.gradient_checkpointing:
                 hidden_states = self._gradient_checkpointing_func(
+                    block, hidden_states, attention_mask, image_rotary_emb, adaln_input
                 )
             else:
+                hidden_states = block(hidden_states, attention_mask, image_rotary_emb, adaln_input)
+        output = self.final_layer(hidden_states, conditioning=adaln_input)
         if not return_dict:
             return (output,)

diffusers_src/src/diffusers/pipelines/ideogram4/pipeline_ideogram4.py CHANGED Viewed

@@ -29,10 +29,11 @@ from ...models.transformers.transformer_ideogram4 import (
     Ideogram4Transformer2DModel,
 )
 from ...schedulers import FlowMatchEulerDiscreteScheduler
-from ...utils import logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import Ideogram4PipelineOutput
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
@@ -42,10 +43,9 @@ logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 # text conditioning consumed by the Ideogram4 transformer.
 QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
-DEFAULT_NUM_INFERENCE_STEPS = 48
-DEFAULT_GUIDANCE_SCHEDULE = (7.0,) * 45 + (3.0,) * 3
-DEFAULT_MU = 0.0
-DEFAULT_STD = 1.5
 EXAMPLE_DOC_STRING = """
@@ -109,6 +109,32 @@ def _resolution_aware_mu(
     return base_mu + 0.5 * math.log(num_pixels / base_pixels)
 class Ideogram4Pipeline(DiffusionPipeline):
     r"""
     Text-to-image pipeline for Ideogram4.
@@ -165,38 +191,110 @@ class Ideogram4Pipeline(DiffusionPipeline):
         self.patch_size = 2
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * self.patch_size)
-    @property
-    def _patch_dim(self) -> int:
-        return self.vae_scale_factor * self.patch_size
-    def _tokenize(self, prompt: str, max_text_tokens: int) -> tuple[torch.Tensor, int]:
-        """Build chat-formatted token ids for a single prompt."""
-        messages = [{"role": "user", "content": [{"type": "text", "text": prompt}]}]
-        text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
-        encoded = self.tokenizer(text, return_tensors="pt", add_special_tokens=False)
-        token_ids = encoded["input_ids"][0]
-        num_text_tokens = int(token_ids.shape[0])
-        if num_text_tokens > max_text_tokens:
-            raise ValueError(f"prompt has {num_text_tokens} tokens, exceeds max_sequence_length={max_text_tokens}")
-        return token_ids, num_text_tokens
-    def _build_inputs(
         self,
-        prompts: list[str],
-        height: int,
-        width: int,
         max_text_tokens: int,
         device: torch.device,
-    ) -> dict[str, torch.Tensor]:
-        """Build the packed sequence (left-padded text tokens then image tokens) for one batch."""
-        tokenized = [self._tokenize(p, max_text_tokens) for p in prompts]
-        batch_size = len(prompts)
-        patch = self._patch_dim
-        if height % patch != 0 or width % patch != 0:
-            raise ValueError(f"height/width must be divisible by vae_scale_factor*patch_size={patch}")
-        grid_h = height // patch
-        grid_w = width // patch
         num_image_tokens = grid_h * grid_w
         total_seq_len = max_text_tokens + num_image_tokens
@@ -206,21 +304,15 @@ class Ideogram4Pipeline(DiffusionPipeline):
         t_idx = torch.zeros_like(h_idx)
         image_pos = torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET
-        token_ids = torch.zeros(batch_size, total_seq_len, dtype=torch.long)
-        text_position_ids = torch.zeros(batch_size, total_seq_len, 3, dtype=torch.long)
         position_ids = torch.zeros(batch_size, total_seq_len, 3, dtype=torch.long)
         segment_ids = torch.full((batch_size, total_seq_len), SEQUENCE_PADDING_INDICATOR, dtype=torch.long)
         indicator = torch.zeros(batch_size, total_seq_len, dtype=torch.long)
-        for b, (toks, num_text) in enumerate(tokenized):
-            pad_len = max_text_tokens - num_text
-            offset = pad_len
-            token_ids[b, offset : offset + num_text] = toks
             text_pos = torch.arange(num_text)
             text_pos_3d = torch.stack([text_pos, text_pos, text_pos], dim=1)
-            text_position_ids[b, offset : offset + num_text] = text_pos_3d
             position_ids[b, offset : offset + num_text] = text_pos_3d
             position_ids[b, offset + num_text :] = image_pos
@@ -229,16 +321,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
             segment_ids[b, offset : offset + num_text + num_image_tokens] = 1
-        return {
-            "token_ids": token_ids.to(device),
-            "text_position_ids": text_position_ids.to(device),
-            "position_ids": position_ids.to(device),
-            "segment_ids": segment_ids.to(device),
-            "indicator": indicator.to(device),
-            "num_image_tokens": num_image_tokens,
-            "grid_h": grid_h,
-            "grid_w": grid_w,
-        }
     def _get_text_encoder_hidden_states(
         self,
@@ -283,28 +366,60 @@ class Ideogram4Pipeline(DiffusionPipeline):
     def encode_prompt(
         self,
-        prompts: list[str],
-        token_ids: torch.Tensor,
-        text_position_ids: torch.Tensor,
-        indicator: torch.Tensor,
         device: torch.device,
-    ) -> torch.Tensor:
-        """Encode prompts using the text encoder and stack hidden states from the activation layers."""
-        batch_size, seq_len = token_ids.shape
-        attention_mask = (indicator == LLM_TOKEN_INDICATOR).to(torch.long)
-        pos_2d = text_position_ids[..., 0].contiguous()
-        with torch.no_grad():
-            selected = self._get_text_encoder_hidden_states(token_ids, attention_mask, pos_2d)
-        stacked = torch.stack(selected, dim=0)  # (num_taps, B, L, H)
-        stacked = stacked.permute(1, 2, 3, 0)
-        stacked = stacked.reshape(batch_size, seq_len, -1)
-        # Zero out non-LLM positions so the transformer only sees real text features.
-        text_mask = attention_mask.to(stacked.dtype).unsqueeze(-1)
-        stacked = stacked * text_mask
-        return stacked.to(torch.float32)
     def prepare_latents(
         self,
@@ -325,27 +440,6 @@ class Ideogram4Pipeline(DiffusionPipeline):
             latents = latents.to(device=device, dtype=dtype)
         return latents
-    def _decode(self, z: torch.Tensor, grid_h: int, grid_w: int) -> torch.Tensor:
-        """Unpatch latents, denormalize with the VAE batch-norm stats, and decode through the VAE."""
-        batch_size = z.shape[0]
-        patch = self.patch_size
-        # VAE bn stores per-channel statistics on the packed-channel latent space (ae_channels * patch ** 2).
-        bn_mean = self.vae.bn.running_mean.view(1, 1, -1).to(device=z.device, dtype=z.dtype)
-        bn_std = torch.sqrt(self.vae.bn.running_var + self.vae.config.batch_norm_eps).view(1, 1, -1)
-        bn_std = bn_std.to(device=z.device, dtype=z.dtype)
-        z = z * bn_std + bn_mean
-        ae_channels = z.shape[-1] // (patch * patch)
-        z = z.view(batch_size, grid_h, grid_w, patch, patch, ae_channels)
-        z = z.permute(0, 5, 1, 3, 2, 4).contiguous()
-        z = z.view(batch_size, ae_channels, grid_h * patch, grid_w * patch)
-        z = z.to(self.vae.dtype)
-        image = self.vae.decode(z, return_dict=False)[0]
-        return image
     @property
     def guidance_scale(self) -> float | None:
         return self._guidance_scale
@@ -358,6 +452,50 @@ class Ideogram4Pipeline(DiffusionPipeline):
     def interrupt(self) -> bool:
         return self._interrupt
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
@@ -365,11 +503,12 @@ class Ideogram4Pipeline(DiffusionPipeline):
         prompt: str | list[str] | None = None,
         height: int = 2048,
         width: int = 2048,
-        num_inference_steps: int = DEFAULT_NUM_INFERENCE_STEPS,
         guidance_scale: float | None = None,
-        guidance_schedule: list[float] | torch.Tensor | None = DEFAULT_GUIDANCE_SCHEDULE,
-        mu: float = DEFAULT_MU,
-        std: float = DEFAULT_STD,
         max_sequence_length: int = 2048,
         num_images_per_prompt: int = 1,
         generator: torch.Generator | list[torch.Generator] | None = None,
@@ -377,7 +516,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
         output_type: str = "pil",
         return_dict: bool = True,
         callback_on_step_end: Callable[["Ideogram4Pipeline", int, int, dict[str, Any]], dict[str, Any]] | None = None,
-        callback_on_step_end_tensor_inputs: list[str] | None = None,
     ) -> Ideogram4PipelineOutput | tuple[Any]:
         r"""
         Run text-to-image generation.
@@ -396,16 +535,19 @@ class Ideogram4Pipeline(DiffusionPipeline):
                 velocity predictions are blended as `v = guidance_scale * v_pos + (1 - guidance_scale) * v_neg`.
                 Mutually exclusive with `guidance_schedule` (setting both raises). Defaults to `None`.
             guidance_schedule (`list[float]` or `torch.Tensor`, *optional*):
-                Per-step guidance scale schedule; must have length `num_inference_steps`. The first entry corresponds to
-                the first step (largest noise level). Mutually exclusive with `guidance_scale` (setting both raises).
-                Exactly one of `guidance_scale` and `guidance_schedule` must be set; leaving both unset raises. The
-                recommended schedule for best quality is `DEFAULT_GUIDANCE_SCHEDULE` (7.0 for the main steps, dropping
-                to 3.0 for the final 3 "polish" steps).
             mu (`float`, *optional*, defaults to 0.0):
                 Base mean of the logit-normal flow-matching schedule. The schedule mean is shifted by half the log of
                 the resolution ratio relative to 512x512.
             std (`float`, *optional*, defaults to 1.5):
                 Standard deviation of the logit-normal flow-matching schedule.
             max_sequence_length (`int`, *optional*, defaults to 2048):
                 Maximum number of text tokens per prompt.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -428,66 +570,63 @@ class Ideogram4Pipeline(DiffusionPipeline):
         Returns:
             [`~pipelines.ideogram4.Ideogram4PipelineOutput`] or `tuple`.
         """
-        if prompt is None:
-            raise ValueError("`prompt` must be provided.")
-        if isinstance(prompt, str):
-            prompts = [prompt]
-        else:
-            prompts = list(prompt)
-        if num_inference_steps <= 0:
-            raise ValueError(f"`num_inference_steps` must be > 0, got {num_inference_steps}.")
-        if num_images_per_prompt <= 0:
-            raise ValueError(f"`num_images_per_prompt` must be > 0, got {num_images_per_prompt}.")
-        if guidance_scale is not None and guidance_schedule is not None:
-            raise ValueError(
-                "Only one of `guidance_scale` and `guidance_schedule` may be set."
-            )
-        if guidance_scale is None and guidance_schedule is None:
-            raise ValueError(
-                "One of `guidance_scale` and `guidance_schedule` must be set."
-            )
-        callback_on_step_end_tensor_inputs = callback_on_step_end_tensor_inputs or ["latents"]
-        patch = self._patch_dim
-        if height % patch != 0 or width % patch != 0:
-            raise ValueError(
-                f"`height` ({height}) and `width` ({width}) must both be divisible by {patch} "
-                f"(vae_scale_factor * patch_size)."
-            )
         device = self._execution_device
         self._guidance_scale = guidance_scale
         self._interrupt = False
-        # 1. Build packed input layout shared by the conditional pass.
-        inputs = self._build_inputs(
-            prompts=prompts,
-            height=height,
-            width=width,
-            max_text_tokens=max_sequence_length,
-            device=device,
         )
-        batch_size = len(prompts)
-        num_image_tokens = inputs["num_image_tokens"]
-        grid_h, grid_w = inputs["grid_h"], inputs["grid_w"]
-        # 2. Encode prompts.
-        llm_features = self.encode_prompt(
-            prompts=prompts,
-            token_ids=inputs["token_ids"],
-            text_position_ids=inputs["text_position_ids"],
-            indicator=inputs["indicator"],
             device=device,
         )
-        # 3. Replicate per-prompt tensors for num_images_per_prompt.
-        if num_images_per_prompt > 1:
-            llm_features = llm_features.repeat_interleave(num_images_per_prompt, dim=0)
-            for key in ("position_ids", "segment_ids", "indicator"):
-                inputs[key] = inputs[key].repeat_interleave(num_images_per_prompt, dim=0)
-        effective_batch_size = batch_size * num_images_per_prompt
         # 4. Set up the resolution-aware logit-normal schedule on the scheduler.
         schedule_mu = _resolution_aware_mu(height=height, width=width, base_mu=mu)
@@ -496,21 +635,16 @@ class Ideogram4Pipeline(DiffusionPipeline):
         timesteps = self.scheduler.timesteps
         self._num_timesteps = len(timesteps)
-        # 5. Resolve per-step guidance weights. A constant `guidance_scale` takes one path; otherwise use the
-        # `guidance_schedule`. Exactly one of the two is set (validated above).
         if guidance_scale is not None:
-            gw = torch.full((num_inference_steps,), float(guidance_scale), dtype=torch.float32, device=device)
-        else:
-            gw = torch.as_tensor(guidance_schedule, dtype=torch.float32, device=device)
-            if gw.shape != (num_inference_steps,):
-                raise ValueError(
-                    f"`guidance_schedule` must have shape ({num_inference_steps},), got {tuple(gw.shape)}"
-                )
         # 6. Prepare latents in the packed (B, num_image_tokens, latent_dim) layout.
         latent_dim = self.transformer.config.in_channels
         latents = self.prepare_latents(
-            batch_size=effective_batch_size,
             num_image_tokens=num_image_tokens,
             latent_dim=latent_dim,
             dtype=torch.float32,
@@ -519,27 +653,21 @@ class Ideogram4Pipeline(DiffusionPipeline):
             latents=latents,
         )
-        # 7. Pre-compute the inputs for the unconditional (image-only) branch.
         max_text_tokens = max_sequence_length
-        neg_position_ids = inputs["position_ids"][:, max_text_tokens:]
-        neg_segment_ids = inputs["segment_ids"][:, max_text_tokens:]
-        neg_indicator = inputs["indicator"][:, max_text_tokens:]
-        neg_llm_features = torch.zeros(
-            effective_batch_size,
-            num_image_tokens,
-            llm_features.shape[-1],
-            dtype=llm_features.dtype,
-            device=device,
-        )
         text_z_padding = torch.zeros(
-            effective_batch_size,
             max_text_tokens,
             latent_dim,
             dtype=torch.float32,
             device=device,
         )
         # 8. Denoising loop. The scheduler stores `num_train_timesteps`-scaled timesteps; convert back to model time.
         num_train_timesteps = self.scheduler.config.num_train_timesteps
         with self.progress_bar(total=num_inference_steps) as progress_bar:
@@ -549,36 +677,40 @@ class Ideogram4Pipeline(DiffusionPipeline):
                 # Map sigma-domain timestep to model time `t` in [0, 1] (0 = noise, 1 = clean data).
                 t_model = 1.0 - (t.float() / num_train_timesteps)
-                t_model = t_model.expand(effective_batch_size).to(self.transformer.dtype)
                 # Conditional pass operates on the full packed sequence.
-                pos_z = torch.cat([text_z_padding, latents], dim=1)
                 pos_out = self.transformer(
                     hidden_states=pos_z,
                     timestep=t_model,
                     encoder_hidden_states=llm_features,
-                    position_ids=inputs["position_ids"],
-                    segment_ids=inputs["segment_ids"],
-                    indicator=inputs["indicator"],
                     return_dict=False,
                 )[0]
-                pos_v = pos_out[:, max_text_tokens:]
                 # Unconditional pass uses image-only positions with zeroed text features.
                 neg_v = self.unconditional_transformer(
-                    hidden_states=latents,
                     timestep=t_model,
                     encoder_hidden_states=neg_llm_features,
                     position_ids=neg_position_ids,
                     segment_ids=neg_segment_ids,
                     indicator=neg_indicator,
                     return_dict=False,
-                )[0]
                 gw_i = gw[i]
                 v = gw_i * pos_v + (1.0 - gw_i) * neg_v
-                latents = self.scheduler.step(-v.to(torch.float32), t, latents, return_dict=False)[0]
                 if callback_on_step_end is not None:
                     callback_kwargs = {k: locals()[k] for k in callback_on_step_end_tensor_inputs}
@@ -587,11 +719,24 @@ class Ideogram4Pipeline(DiffusionPipeline):
                 progress_bar.update()
-        # 9. Decode.
         if output_type == "latent":
             image = latents
         else:
-            decoded = self._decode(latents, grid_h=grid_h, grid_w=grid_w)
             image = self.image_processor.postprocess(decoded.float(), output_type=output_type)
         self.maybe_free_model_hooks()

     Ideogram4Transformer2DModel,
 )
 from ...schedulers import FlowMatchEulerDiscreteScheduler
+from ...utils import is_outlines_available, logging, replace_example_docstring
 from ...utils.torch_utils import randn_tensor
 from ..pipeline_utils import DiffusionPipeline
 from .pipeline_output import Ideogram4PipelineOutput
+from .prompt_enhancer import CAPTION_SYSTEM_MESSAGE, CAPTION_USER_TEMPLATE, build_caption_logits_processor
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 # text conditioning consumed by the Ideogram4 transformer.
 QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
+# LM head grafted onto the (head-less) text encoder for optional prompt upsampling.
+DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
+PROMPT_UPSAMPLE_TEMPERATURE = 1.0
 EXAMPLE_DOC_STRING = """
     return base_mu + 0.5 * math.log(num_pixels / base_pixels)
+def _expand_tensor_to_effective_batch(
+    tensor: torch.Tensor,
+    batch_size: int,
+    num_per_prompt: int,
+    tensor_name: str | None = None,
+) -> torch.Tensor:
+    """Tile `tensor` along dim 0 until it has `batch_size * num_per_prompt` rows.
+    Accepted leading sizes are 1 (the single row is repeated for every slot), `batch_size` (each row is repeated
+    `num_per_prompt` times, keeping the copies of one prompt adjacent) and the full effective batch size (the
+    input is returned as-is, without a copy).
+    Args:
+        tensor: Tensor whose first dimension is the batch dimension.
+        batch_size: Number of prompts.
+        num_per_prompt: Number of outputs requested per prompt.
+        tensor_name: Optional argument name used only to label the error message.
+    Returns:
+        A tensor with `batch_size * num_per_prompt` rows.
+    Raises:
+        ValueError: If the leading size is none of the three accepted values.
+    """
+    current = tensor.shape[0]
+    effective = batch_size * num_per_prompt
+    # Already expanded: hand the very same object back.
+    if current == effective:
+        return tensor
+    # The singleton case is tested first, so it also wins when `batch_size` itself is 1.
+    if current == 1:
+        copies = effective
+    elif current == batch_size:
+        copies = num_per_prompt
+    else:
+        label = "Tensor" if tensor_name is None else f"`{tensor_name}`"
+        raise ValueError(
+            f"{label} batch size must be 1, `batch_size` ({batch_size}), or "
+            f"`batch_size * num_*_per_prompt` ({effective}), but got {current}."
+        )
+    # `output_size` is passed explicitly, as in the original call, so the result length is known up front.
+    return torch.repeat_interleave(tensor, repeats=copies, dim=0, output_size=current * copies)
 class Ideogram4Pipeline(DiffusionPipeline):
     r"""
     Text-to-image pipeline for Ideogram4.
         self.patch_size = 2
         self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * self.patch_size)
+        # Lazily built by `load_prompt_enhancer` for optional prompt upsampling.
+        self._caption_model = None
+        self._caption_logits_processor = None
+    def load_prompt_enhancer(
+        self,
+        lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
+        lm_head_filename: str = "lm_head.safetensors",
+        torch_dtype: torch.dtype | None = None,
+    ) -> PreTrainedModel:
+        """Make the frozen text encoder generative for prompt upsampling by grafting a hosted LM head.
+        The head is the only extra weight loaded; the encoder body is shared (no second model in memory).
+        Called automatically by `upsample_prompt` on first use. Generation is constrained to the caption JSON
+        schema when `outlines` is installed; otherwise it falls back to unconstrained decoding with a warning.
+        Args:
+            lm_head_repo_id (`str`):
+                Hub repository that hosts the LM-head weights.
+            lm_head_filename (`str`):
+                Safetensors file inside that repository; it must contain an `"lm_head.weight"` tensor.
+            torch_dtype (`torch.dtype`, *optional*):
+                Dtype for the head. Falls back to `self.text_encoder.dtype` when not given.
+        Returns:
+            The assembled captioning model. It is also stored on `self._caption_model`, and the (possibly `None`)
+            logits processor is stored on `self._caption_logits_processor`.
+        """
+        # Function-scope imports: these are only resolved when the enhancer is actually loaded.
+        from accelerate import init_empty_weights
+        from huggingface_hub import hf_hub_download
+        from safetensors.torch import load_file
+        from transformers import Qwen3VLForConditionalGeneration
+        dtype = torch_dtype or self.text_encoder.dtype
+        # Fetch the head checkpoint from the Hub and keep only the head matrix, cast to the working dtype.
+        head_weight = load_file(hf_hub_download(lm_head_repo_id, lm_head_filename))["lm_head.weight"].to(dtype)
+        # Build the generative wrapper from the encoder's config under `init_empty_weights` -- presumably so that
+        # no real parameter memory is allocated for the sub-modules replaced below (TODO confirm).
+        with init_empty_weights():
+            caption_model = Qwen3VLForConditionalGeneration(self.text_encoder.config)
+        caption_model.model = self.text_encoder  # reuse the loaded encoder body
+        # Size the projection from the checkpoint itself: `nn.Linear` stores its weight as (out_features, in_features).
+        lm_head = torch.nn.Linear(head_weight.shape[1], head_weight.shape[0], bias=False)
+        with torch.no_grad():
+            lm_head.weight.copy_(head_weight)
+        # Place the head on the same device as the encoder it is attached to.
+        caption_model.lm_head = lm_head.to(device=self.text_encoder.device, dtype=dtype)
+        # NOTE(review): `eval()` recurses into `caption_model.model`, i.e. it also puts the shared text encoder
+        # in eval mode.
+        caption_model.eval()
+        if is_outlines_available():
+            logits_processor = build_caption_logits_processor(caption_model, self.tokenizer)
+        else:
+            logits_processor = None
+            logger.warning(
+                "`outlines` is not installed; prompt upsampling will run unconstrained and may not return "
+                "schema-valid JSON. Install with `pip install outlines` for structured captions."
+            )
+        # Cache both pieces so later `upsample_prompt` calls skip the download and the graft.
+        self._caption_model = caption_model
+        self._caption_logits_processor = logits_processor
+        return caption_model
+    def upsample_prompt(
+        self,
+        prompt: str | list[str],
+        height: int = 2048,
+        width: int = 2048,
+        max_new_tokens: int = 1024,
+        lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
+        device: torch.device | None = None,
+    ) -> list[str]:
+        """Expand every prompt into a structured JSON caption using the text encoder with its grafted LM head.
+        Args:
+            prompt (`str` or `list[str]`):
+                One prompt or several; a single string is treated as a one-element list.
+            height (`int`), width (`int`):
+                Target image size. Only the reduced `width:height` ratio is passed on to the caption template.
+            max_new_tokens (`int`):
+                Generation budget per caption.
+            lm_head_repo_id (`str`):
+                Repository of the LM head. Only consulted when the enhancer has not been loaded yet.
+            device (`torch.device`, *optional*):
+                Device for the tokenized inputs. Defaults to the caption model's device.
+        Returns:
+            One decoded, whitespace-stripped caption per input prompt, in input order.
+        """
+        # Lazy one-time setup of the captioning model.
+        if self._caption_model is None:
+            self.load_prompt_enhancer(lm_head_repo_id=lm_head_repo_id)
+        target_device = device or self._caption_model.device
+        # Reduce the resolution to its simplest ratio; `or 1` guards the gcd(0, 0) == 0 case.
+        common = math.gcd(width, height) or 1
+        aspect_ratio = f"{width // common}:{height // common}"
+        processor = self._caption_logits_processor
+        sampling_kwargs = {
+            "max_new_tokens": max_new_tokens,
+            "do_sample": True,
+            "temperature": PROMPT_UPSAMPLE_TEMPERATURE,
+            "use_cache": True,
+        }
+        captions = []
+        for text_prompt in [prompt] if isinstance(prompt, str) else list(prompt):
+            user_message = CAPTION_USER_TEMPLATE.format(aspect_ratio=aspect_ratio, original_prompt=text_prompt)
+            chat = [
+                {"role": "system", "content": CAPTION_SYSTEM_MESSAGE},
+                {"role": "user", "content": user_message},
+            ]
+            inputs = self.tokenizer.apply_chat_template(
+                chat, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
+            ).to(target_device)
+            generate_kwargs = dict(sampling_kwargs)
+            if processor is not None:
+                # The processor is reset before each generation so every prompt starts from a fresh state.
+                processor.reset()
+                generate_kwargs["logits_processor"] = [processor]
+            generated = self._caption_model.generate(**inputs, **generate_kwargs)
+            # Prompts are processed one at a time, so row 0 is the only row; drop the echoed prompt tokens.
+            prompt_length = inputs["input_ids"].shape[1]
+            caption = self.tokenizer.decode(generated[0, prompt_length:], skip_special_tokens=True)
+            captions.append(caption.strip())
+        return captions
+    def _prepare_ids(
         self,
+        text_lengths: list[int],
+        grid_h: int,
+        grid_w: int,
         max_text_tokens: int,
         device: torch.device,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Build the packed `[left-pad][text][image]` layout from the per-prompt text lengths and the image grid.
+        Returns `position_ids` (3-axis MRoPE), `segment_ids` (block-diagonal attention) and `indicator` (per-token
+        text/image/pad role).
+        """
+        batch_size = len(text_lengths)
         num_image_tokens = grid_h * grid_w
         total_seq_len = max_text_tokens + num_image_tokens
         t_idx = torch.zeros_like(h_idx)
         image_pos = torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET
         position_ids = torch.zeros(batch_size, total_seq_len, 3, dtype=torch.long)
         segment_ids = torch.full((batch_size, total_seq_len), SEQUENCE_PADDING_INDICATOR, dtype=torch.long)
         indicator = torch.zeros(batch_size, total_seq_len, dtype=torch.long)
+        for b, num_text in enumerate(text_lengths):
+            offset = max_text_tokens - num_text
             text_pos = torch.arange(num_text)
             text_pos_3d = torch.stack([text_pos, text_pos, text_pos], dim=1)
             position_ids[b, offset : offset + num_text] = text_pos_3d
             position_ids[b, offset + num_text :] = image_pos
             segment_ids[b, offset : offset + num_text + num_image_tokens] = 1
+        return position_ids.to(device), segment_ids.to(device), indicator.to(device)
     def _get_text_encoder_hidden_states(
         self,
     def encode_prompt(
         self,
+        prompt: str | list[str],
+        grid_h: int,
+        grid_w: int,
+        max_sequence_length: int,
         device: torch.device,
+    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Prepare the conditioning for the packed text+image sequence (one entry per prompt).
+        Returns a flat tuple `(prompt_embeds, position_ids, segment_ids, indicator)`. The unconditional branch carries
+        no text, so the pipeline builds its (zeroed) inputs directly rather than encoding a negative prompt.
+        Args:
+            prompt (`str` or `list[str]`):
+                One prompt or several; a single string is treated as a one-element list.
+            grid_h (`int`), grid_w (`int`):
+                Image grid size; `grid_h * grid_w` image slots are appended after the text region.
+            max_sequence_length (`int`):
+                Size of the (left-padded) text region.
+            device (`torch.device`):
+                Device the encoder inputs and the returned tensors are placed on.
+        Returns:
+            `prompt_embeds` is float32 with shape `(batch, max_sequence_length + grid_h * grid_w, features)`; its
+            padding and image rows are zero. `position_ids`, `segment_ids` and `indicator` are whatever
+            `_prepare_ids` builds for the same layout.
+        Raises:
+            ValueError: If a chat-formatted prompt tokenizes to more than `max_sequence_length` tokens.
+        """
+        prompts = [prompt] if isinstance(prompt, str) else list(prompt)
+        batch_size = len(prompts)
+        num_image_tokens = grid_h * grid_w
+        # Tokenize each chat-formatted prompt and left-pad to `max_sequence_length`. Only the text region is fed to
+        # the encoder: the packed image tokens come after the text and the encoder is causal, so they never affect it.
+        # Padding slots keep token id 0 / position 0; they are excluded through `attention_mask` and their features
+        # are zeroed again after encoding.
+        token_ids = torch.zeros(batch_size, max_sequence_length, dtype=torch.long)
+        attention_mask = torch.zeros(batch_size, max_sequence_length, dtype=torch.long)
+        text_position_ids = torch.zeros(batch_size, max_sequence_length, dtype=torch.long)
+        text_lengths = []
+        for b, text_prompt in enumerate(prompts):
+            messages = [{"role": "user", "content": [{"type": "text", "text": text_prompt}]}]
+            text = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
+            # The chat template was already rendered to text above, hence `add_special_tokens=False` here.
+            toks = self.tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0]
+            n = int(toks.shape[0])
+            if n > max_sequence_length:
+                raise ValueError(f"prompt has {n} tokens, exceeds max_sequence_length={max_sequence_length}")
+            text_lengths.append(n)
+            # Right-align the real tokens so every prompt ends at the same index, directly before the image slots.
+            offset = max_sequence_length - n
+            token_ids[b, offset:] = toks
+            attention_mask[b, offset:] = 1
+            text_position_ids[b, offset:] = torch.arange(n)
+        token_ids = token_ids.to(device)
+        attention_mask = attention_mask.to(device)
+        text_position_ids = text_position_ids.to(device)
+        # Concatenate the tapped activation-layer hidden states into per-token text features, zeroing padding.
+        # The stack is (layers, batch, seq, hidden); the permute moves the layer axis last, so after the reshape the
+        # layer index varies fastest inside each token's feature vector.
+        selected = self._get_text_encoder_hidden_states(token_ids, attention_mask, text_position_ids)
+        text_features = torch.stack(selected, dim=0).permute(1, 2, 3, 0).reshape(batch_size, max_sequence_length, -1)
+        text_features = (text_features * attention_mask.to(text_features.dtype).unsqueeze(-1)).to(torch.float32)
+        position_ids, segment_ids, indicator = self._prepare_ids(
+            text_lengths, grid_h, grid_w, max_sequence_length, device
+        )
+        # Pack the text features into the full sequence; image positions carry no text features.
+        image_feature_padding = torch.zeros(
+            batch_size, num_image_tokens, text_features.shape[-1], dtype=text_features.dtype, device=device
+        )
+        prompt_embeds = torch.cat([text_features, image_feature_padding], dim=1)
+        return prompt_embeds, position_ids, segment_ids, indicator
     def prepare_latents(
         self,
             latents = latents.to(device=device, dtype=dtype)
         return latents
     @property
     def guidance_scale(self) -> float | None:
         return self._guidance_scale
     def interrupt(self) -> bool:
         return self._interrupt
+    def check_inputs(
+        self,
+        prompt,
+        height,
+        width,
+        num_inference_steps,
+        guidance_scale,
+        guidance_schedule,
+        callback_on_step_end_tensor_inputs=None,
+    ):
+        """Validate the arguments of `__call__` before any model work is done.
+        Args:
+            prompt: A `str` or a non-empty `list` of `str`.
+            height, width: Output size in pixels; both must be multiples of `vae_scale_factor * patch_size`.
+            num_inference_steps: Number of denoising steps; must be at least 1.
+            guidance_scale: Constant guidance scale, or `None` when a schedule is used.
+            guidance_schedule: Per-step guidance scales of length `num_inference_steps`, or `None` when a
+                constant scale is used.
+            callback_on_step_end_tensor_inputs: Optional names requested by the step-end callback; each must be
+                listed in `self._callback_tensor_inputs`.
+        Raises:
+            ValueError: On the first violated constraint.
+        """
+        if prompt is None:
+            raise ValueError("`prompt` must be provided.")
+        if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if isinstance(prompt, list):
+            # An empty list would give a zero-sized batch, and non-string entries would only fail later inside
+            # tokenization; reject both here with a clear message.
+            if not prompt:
+                raise ValueError("`prompt` must contain at least one prompt, but got an empty list.")
+            if not all(isinstance(entry, str) for entry in prompt):
+                raise ValueError("Every entry of a `prompt` list has to be of type `str`.")
+        multiple = self.vae_scale_factor * self.patch_size
+        if height % multiple != 0 or width % multiple != 0:
+            raise ValueError(
+                f"`height` ({height}) and `width` ({width}) must both be divisible by {multiple} "
+                f"(vae_scale_factor * patch_size)."
+            )
+        # With a constant `guidance_scale` nothing else ties the step count to a length, so check it explicitly.
+        if num_inference_steps is None or num_inference_steps < 1:
+            raise ValueError(f"`num_inference_steps` must be a positive integer, got {num_inference_steps}.")
+        # Guidance is controlled by either a constant `guidance_scale` or a per-step `guidance_schedule`; exactly
+        # one must be set (the `guidance_schedule` default makes the no-arg call use the recommended schedule).
+        if guidance_scale is not None and guidance_schedule is not None:
+            raise ValueError("Only one of `guidance_scale` and `guidance_schedule` may be set.")
+        if guidance_scale is None and guidance_schedule is None:
+            raise ValueError("One of `guidance_scale` and `guidance_schedule` must be set.")
+        if guidance_schedule is not None and len(guidance_schedule) != num_inference_steps:
+            raise ValueError(
+                f"`guidance_schedule` must have length `num_inference_steps` ({num_inference_steps}), "
+                f"got {len(guidance_schedule)}."
+            )
+        if callback_on_step_end_tensor_inputs is not None:
+            unknown = [k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]
+            if unknown:
+                raise ValueError(
+                    f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found "
+                    f"{unknown}"
+                )
     @torch.no_grad()
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         prompt: str | list[str] | None = None,
         height: int = 2048,
         width: int = 2048,
+        num_inference_steps: int = 48,
         guidance_scale: float | None = None,
+        guidance_schedule: list[float] | torch.Tensor | None = (7.0,) * 45 + (3.0,) * 3,
+        mu: float = 0.0,
+        std: float = 1.5,
+        prompt_upsampling: bool = False,
         max_sequence_length: int = 2048,
         num_images_per_prompt: int = 1,
         generator: torch.Generator | list[torch.Generator] | None = None,
         output_type: str = "pil",
         return_dict: bool = True,
         callback_on_step_end: Callable[["Ideogram4Pipeline", int, int, dict[str, Any]], dict[str, Any]] | None = None,
+        callback_on_step_end_tensor_inputs: list[str] = ["latents"],
     ) -> Ideogram4PipelineOutput | tuple[Any]:
         r"""
         Run text-to-image generation.
                 velocity predictions are blended as `v = guidance_scale * v_pos + (1 - guidance_scale) * v_neg`.
                 Mutually exclusive with `guidance_schedule` (setting both raises). Defaults to `None`.
             guidance_schedule (`list[float]` or `torch.Tensor`, *optional*):
+                Per-step guidance scale schedule; must have length `num_inference_steps`. The first entry corresponds
+                to the first step (largest noise level). Mutually exclusive with `guidance_scale`; exactly one must be
+                set. Defaults to the recommended schedule (7.0 for the main steps, dropping to 3.0 for the final 3
+                "polish" steps). To use a constant scale instead, pass `guidance_scale` and `guidance_schedule=None`.
             mu (`float`, *optional*, defaults to 0.0):
                 Base mean of the logit-normal flow-matching schedule. The schedule mean is shifted by half the log of
                 the resolution ratio relative to 512x512.
             std (`float`, *optional*, defaults to 1.5):
                 Standard deviation of the logit-normal flow-matching schedule.
+            prompt_upsampling (`bool`, *optional*, defaults to `False`):
+                If `True`, rewrite `prompt` into Ideogram4's native structured JSON caption via
+                [`~Ideogram4Pipeline.upsample_prompt`] before encoding. Requires the prompt-enhancer LM head
+                (downloaded on first use); install `outlines` for schema-constrained captions.
             max_sequence_length (`int`, *optional*, defaults to 2048):
                 Maximum number of text tokens per prompt.
             num_images_per_prompt (`int`, *optional*, defaults to 1):
         Returns:
             [`~pipelines.ideogram4.Ideogram4PipelineOutput`] or `tuple`.
         """
+        self.check_inputs(
+            prompt=prompt,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            guidance_schedule=guidance_schedule,
+            callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
+        )
+        if isinstance(prompt, str):
+            batch_size = 1
+        elif isinstance(prompt, list):
+            batch_size = len(prompt)
         device = self._execution_device
         self._guidance_scale = guidance_scale
         self._interrupt = False
+        # 0. Optionally rewrite the prompt(s) into Ideogram4's native structured JSON caption.
+        if prompt_upsampling:
+            prompt = self.upsample_prompt(prompt, height=height, width=width, device=device)
+        # 1. Image grid (drives both the packed layout and the latent shape).
+        grid_h, grid_w = (
+            height // (self.vae_scale_factor * self.patch_size),
+            width // (self.vae_scale_factor * self.patch_size),
         )
+        num_image_tokens = grid_h * grid_w
+        # 2. Encode prompts into the packed conditioning (one entry per prompt).
+        llm_features, position_ids, segment_ids, indicator = self.encode_prompt(
+            prompt=prompt,
+            grid_h=grid_h,
+            grid_w=grid_w,
+            max_sequence_length=max_sequence_length,
             device=device,
         )
+        # 3. Replicate the conditioning for num_images_per_prompt.
+        llm_features = _expand_tensor_to_effective_batch(llm_features, batch_size, num_images_per_prompt)
+        position_ids = _expand_tensor_to_effective_batch(position_ids, batch_size, num_images_per_prompt)
+        segment_ids = _expand_tensor_to_effective_batch(segment_ids, batch_size, num_images_per_prompt)
+        indicator = _expand_tensor_to_effective_batch(indicator, batch_size, num_images_per_prompt)
+        # 4. Unconditional (image-only) branch, derived from the conditioning: zeroed text features and the
+        # image-region slices of the layout.
+        neg_llm_features = torch.zeros(
+            batch_size * num_images_per_prompt,
+            num_image_tokens,
+            llm_features.shape[-1],
+            dtype=llm_features.dtype,
+            device=device,
+        )
+        neg_position_ids = position_ids[:, max_sequence_length:]
+        neg_segment_ids = segment_ids[:, max_sequence_length:]
+        neg_indicator = indicator[:, max_sequence_length:]
         # 4. Set up the resolution-aware logit-normal schedule on the scheduler.
         schedule_mu = _resolution_aware_mu(height=height, width=width, base_mu=mu)
         timesteps = self.scheduler.timesteps
         self._num_timesteps = len(timesteps)
+        # 5. Resolve the per-step guidance schedule (a constant `guidance_scale` broadcasts to every step, otherwise
+        # use the provided `guidance_schedule`, validated by `check_inputs`) and the tensor of per-step weights `gw`.
         if guidance_scale is not None:
+            guidance_schedule = [float(guidance_scale)] * num_inference_steps
+        gw = torch.as_tensor(guidance_schedule, dtype=torch.float32, device=device)
         # 6. Prepare latents in the packed (B, num_image_tokens, latent_dim) layout.
         latent_dim = self.transformer.config.in_channels
         latents = self.prepare_latents(
+            batch_size=batch_size * num_images_per_prompt,
             num_image_tokens=num_image_tokens,
             latent_dim=latent_dim,
             dtype=torch.float32,
             latents=latents,
         )
+        # 7. Padding for the text region of the conditional packed sequence (image latents are appended after it).
         max_text_tokens = max_sequence_length
         text_z_padding = torch.zeros(
+            batch_size * num_images_per_prompt,
             max_text_tokens,
             latent_dim,
             dtype=torch.float32,
             device=device,
         )
+        # The transformers run in their loaded compute dtype; cast the (otherwise float32) text features to match.
+        # `latents` stay float32 for scheduler precision and are cast per-step at the transformer call below.
+        llm_features = llm_features.to(self.transformer.dtype)
+        neg_llm_features = neg_llm_features.to(self.unconditional_transformer.dtype)
         # 8. Denoising loop. The scheduler stores `num_train_timesteps`-scaled timesteps; convert back to model time.
         num_train_timesteps = self.scheduler.config.num_train_timesteps
         with self.progress_bar(total=num_inference_steps) as progress_bar:
                 # Map sigma-domain timestep to model time `t` in [0, 1] (0 = noise, 1 = clean data).
                 t_model = 1.0 - (t.float() / num_train_timesteps)
+                t_model = t_model.expand(batch_size * num_images_per_prompt).to(self.transformer.dtype)
                 # Conditional pass operates on the full packed sequence.
+                pos_z = torch.cat([text_z_padding, latents], dim=1).to(self.transformer.dtype)
                 pos_out = self.transformer(
                     hidden_states=pos_z,
                     timestep=t_model,
                     encoder_hidden_states=llm_features,
+                    position_ids=position_ids,
+                    segment_ids=segment_ids,
+                    indicator=indicator,
                     return_dict=False,
                 )[0]
+                # Velocity (and guidance) is computed in float32 for scheduler precision; the transformers
+                # return their compute dtype, so cast the predicted velocities up here.
+                pos_v = pos_out[:, max_text_tokens:].to(torch.float32)
                 # Unconditional pass uses image-only positions with zeroed text features.
                 neg_v = self.unconditional_transformer(
+                    hidden_states=latents.to(self.unconditional_transformer.dtype),
                     timestep=t_model,
                     encoder_hidden_states=neg_llm_features,
                     position_ids=neg_position_ids,
                     segment_ids=neg_segment_ids,
                     indicator=neg_indicator,
                     return_dict=False,
+                )[0].to(torch.float32)
+                # Expose the current step's guidance weight via `self.guidance_scale` so callbacks can read it.
+                self._guidance_scale = guidance_schedule[i]
                 gw_i = gw[i]
                 v = gw_i * pos_v + (1.0 - gw_i) * neg_v
+                latents = self.scheduler.step(-v, t, latents, return_dict=False)[0]
                 if callback_on_step_end is not None:
                     callback_kwargs = {k: locals()[k] for k in callback_on_step_end_tensor_inputs}
                 progress_bar.update()
+        # 9. Decode: unpatch the latents, denormalize with the VAE batch-norm stats, and decode through the VAE.
         if output_type == "latent":
             image = latents
         else:
+            z = latents
+            # VAE bn stores per-channel statistics on the packed-channel latent space (ae_channels * patch ** 2).
+            bn_mean = self.vae.bn.running_mean.view(1, 1, -1).to(device=z.device, dtype=z.dtype)
+            bn_std = torch.sqrt(self.vae.bn.running_var + self.vae.config.batch_norm_eps).view(1, 1, -1)
+            bn_std = bn_std.to(device=z.device, dtype=z.dtype)
+            z = z * bn_std + bn_mean
+            patch = self.patch_size
+            ae_channels = z.shape[-1] // (patch * patch)
+            z = z.view(batch_size * num_images_per_prompt, grid_h, grid_w, patch, patch, ae_channels)
+            z = z.permute(0, 5, 1, 3, 2, 4).contiguous()
+            z = z.view(batch_size * num_images_per_prompt, ae_channels, grid_h * patch, grid_w * patch)
+            decoded = self.vae.decode(z.to(self.vae.dtype), return_dict=False)[0]
             image = self.image_processor.postprocess(decoded.float(), output_type=output_type)
         self.maybe_free_model_hooks()

diffusers_src/src/diffusers/pipelines/ideogram4/prompt_enhancer.py ADDED Viewed

	@@ -0,0 +1,109 @@

+# Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Prompt-enhancement assets for Ideogram4.
+Ideogram4 is trained on a *structured JSON caption* rather than a free-form prompt. The optional prompt
+enhancer rewrites a short user idea into that native caption schema, using the pipeline's own (frozen)
+Qwen3-VL text encoder grafted with a generative head (see `Ideogram4Pipeline.load_prompt_enhancer`).
+This mirrors the role of Flux2's `system_messages.py`, but the target is a constrained JSON object instead of
+free text, so `outlines` (an optional dependency) is used to guarantee a schema-valid result when available.
+"""
+# System message that instructs the encoder to emit Ideogram4's native single-line JSON caption.
+CAPTION_SYSTEM_MESSAGE = """You convert a short user idea into a structured JSON caption for an image renderer. Output ONE minified single-line JSON object and NOTHING else (no markdown, no commentary).
+SCHEMA — keys in this exact order:
+{"high_level_description":"...","compositional_deconstruction":{"background":"...","elements":[ ... ]}}
+- object element: {"type":"obj","desc":"..."}
+- text element:   {"type":"text","text":"VERBATIM CHARS","desc":"..."}
+STEP 1 — PICK THE MEDIUM. It decides what `background` and `elements` mean. Honor any medium or style the user implies; default to photograph only when nothing else fits. Render ANY subject faithfully — real, fantastical, sci-fi, surreal, abstract — in the chosen medium.
+A) DESIGNED ARTIFACT — poster, logo, album/book cover, flyer, banner, sticker, packaging, app icon, infographic, menu, card, wordmark. THE FRAME IS THE ARTIFACT — never a photo of it hanging in a room.
+   - high_level_description: name it as graphic design (e.g. "a minimalist jazz poster, flat graphic design...").
+   - background: the design's OWN backdrop only — a flat color, gradient, or simple texture filling the frame. No room, wall, floor, easel, depth, or camera/photo language.
+   - elements: the design's parts as a flat 2D layout — a `text` element for every headline/label (verbatim), `obj` elements for the central graphic/illustration/shapes/badges. Place by region (top / center / bottom).
+B) SCENE — a photograph, illustration, painting, 3D render, anime frame, etc. of a real or imagined place or subject.
+   - high_level_description: one sentence naming the subject and the medium/style.
+   - background: the scene SHELL — surroundings, ground/sky/walls, atmosphere, ambient light; concrete and specific. The ground/floor/water surface lives here, never as an element.
+   - elements: the main subject FIRST as an `obj`, then supporting `obj` elements (props, secondary subjects) that plausibly belong. Add `text` elements only where the scene would really carry text (signs, labels, brands).
+C) ABSTRACT / CONCEPTUAL — "nostalgia", "chaos and order", "sound waves", pure pattern. Concretize the idea into a deliberate visual composition.
+   - background: the dominant color field, gradient, or texture of the composition.
+   - elements: the shapes, forms, motifs, or symbolic objects that carry the concept, as `obj` elements. Add `text` only if the idea calls for words.
+UNIVERSAL RULES (every medium):
+1. The user's core subject/concept MUST appear among the elements (as an `obj`, normally first). Naming it only in high_level_description or background is NOT enough.
+2. Commit to ONE concrete value each (one color, one style, one count). No hedging: ban "various", "such as", "e.g.", "or similar", "maybe", "X or Y" for one property.
+3. NEVER use a transparent, empty, or plain white background UNLESS the user explicitly says "transparent", "isolated", "sticker", or "cutout".
+4. A coherent subject (one animal, person, vehicle, object) is exactly ONE element; its parts go inside its `desc`. Use separate elements for genuinely separate subjects.
+5. Each `desc` is 25-55 words, identity-first, standalone. Do not mention shadows, depth of field, bokeh, lens, focus, or grain.
+6. high_level_description: one sentence, at most 40 words, starts with the subject, names the medium. Preserve non-ASCII characters as-is.
+7. Output STRICTLY VALID JSON: double quotes around every key and string, NO trailing commas, each element object closes with "}" right after its last value.
+8. Catch the "warm" impulse. Only when you are about to describe light as "warm", "golden", "amber", or "honey", stop and check: is there a specific physical source in the scene casting that colour (candle, sunset, lamp, neon, fire)? If YES, name the source and the colour it casts instead of the mood word. If NO, you are just reaching for warmth as ambience — drop it and leave the light neutral ("soft" or "even"). Don't recolour or relight anything else; this only intercepts the warm reach, every other scene and mood the user wants is untouched.
+9. Describe physical reality, not impressions. Avoid mood-words — "luminous", "radiant", "vibrant", "lush", "dynamic", "gorgeous", "stunning", "breathtaking", "mesmerizing", and metaphorical "glowing" — they produce a generic AI look (the same trap as "warm"). Use observable properties: "the cheekbone catches a small highlight", not "luminous complexion".
+10. Every named thing must appear as its own element. Each subject, object, sign, and quoted phrase the user names gets its own element — quoted text (single or double quotes) becomes its own verbatim `text` element. Count the named units in the prompt; the element list must hold at least that many. Don't drop or merge them.
+11. Don't add what wasn't asked for. No glitch art, wireframe overlay, body fragmentation, double-exposure, "dissolving", or extra stylization unless the prompt requests it. Asked for a cinematic photo of a journalist → render that, not a glitch-art composite.
+12. Name attributes concretely, anchored to landmarks. People: skin tone, hair (colour + style), each visible garment with colour, expression, pose, one distinguishing feature. Objects: shape, material, colour, a distinctive part. Place things against named references — "resting on the lower-right corner of the table", not "on the surface".
+13. Name real references by name. If the user names a brand, product, character, place, or person (Nike Dunk Low, Spider-Man, the Eiffel Tower), keep that exact name in the `desc`; don't swap it for a generic look-alike unless they ask for an anonymous one.
+14. "Professional photo/headshot" of a person means professional CONTEXT — neutral attire, soft even daylight, neutral backdrop, friendly expression — not dramatic studio gear; no heavy rim-light or creamy bokeh unless asked.
+EXAMPLES
+User idea: a cup of coffee on a table
+Output: {"high_level_description":"A white ceramic cup of black coffee on a worn wooden cafe table, a casual overcast-daylight phone photograph with an off-center composition.","compositional_deconstruction":{"background":"Scratched oak cafe table filling the lower frame, a pale grey mortar-lined brick wall a few feet behind slightly out of focus, a tall window on the left spilling soft overcast daylight across the table, neutral white balance, muted brown and green tones.","elements":[{"type":"obj","desc":"White ceramic cup of black coffee with a thin curved handle turned to the right and a faint crema ring at the rim, resting on a matching round saucer near the center of the table, a thin wisp of steam at the surface."},{"type":"obj","desc":"Brushed-steel teaspoon lying on the saucer to the right of the cup, handle angled toward the lower-right corner, a single small water droplet on the bowl of the spoon."}]}}
+User idea: a minimalist poster for a jazz festival
+Output: {"high_level_description":"A minimalist jazz festival poster, flat graphic design with bold typography and a single abstract saxophone motif on a deep teal background.","compositional_deconstruction":{"background":"Solid deep teal background filling the entire frame with a subtle fine paper-grain texture and a thin mustard-yellow keyline border just inside the edges, no scene and no depth.","elements":[{"type":"obj","desc":"A large flat geometric saxophone in mustard yellow and cream, centered in the upper two-thirds, built from simple bold shapes with no shading, angled diagonally from lower-left to upper-right."},{"type":"text","text":"JAZZ\\nFESTIVAL","desc":"Large bold condensed sans-serif headline in cream, stacked on two lines across the center of the poster, slightly overlapping the saxophone motif."},{"type":"text","text":"NOV 15 · CITY HALL","desc":"Small uppercase mustard-yellow caption centered near the bottom edge with wide letter spacing."}]}}"""
+# User-turn template. `{aspect_ratio}` (presented to the model as width:height) and `{original_prompt}` are filled in by `Ideogram4Pipeline.upsample_prompt`.
+CAPTION_USER_TEMPLATE = """TARGET IMAGE ASPECT RATIO: {aspect_ratio} (width:height).
+User idea: {original_prompt}"""
+def build_caption_logits_processor(model, tokenizer):
+    """Build an `outlines` logits processor that constrains generation to the Ideogram4 caption schema.
+    Returns a logits processor compatible with `transformers` `generate(logits_processor=[...])`. The caller is
+    responsible for checking `is_outlines_available()` first; `outlines` (and its `pydantic` dependency) are
+    imported lazily here so they remain optional. The schema mirrors Ideogram's native caption /
+    caption_verifier: a high-level description plus a compositional deconstruction of background + typed elements.
+    """
+    from typing import List, Literal, Union
+    import outlines  # function-scope import: keeps `outlines` optional until this builder is actually called
+    from pydantic import BaseModel, Field  # likewise deferred; only needed to declare the schema below
+    class ObjElement(BaseModel):  # element variant tagged "obj": a description only
+        type: Literal["obj"]  # fixed tag that tells this variant apart inside the Union below
+        desc: str
+    class TextElement(BaseModel):  # element variant tagged "text": the `text` string plus its description
+        type: Literal["text"]
+        text: str
+        desc: str
+    class Composition(BaseModel):  # background string plus the list of typed elements
+        background: str
+        elements: List[Union[ObjElement, TextElement]] = Field(min_length=1)  # at least one element is required
+    class Caption(BaseModel):  # top-level schema handed to `outlines.Generator` below
+        high_level_description: str
+        compositional_deconstruction: Composition
+    outlines_model = outlines.from_transformers(model, tokenizer)  # wrap the model/tokenizer pair for outlines
+    return outlines.Generator(outlines_model, Caption).logits_processor  # only the processor is returned, not the generator

diffusers_src/src/diffusers/quantizers/bitsandbytes/bnb_quantizer.py CHANGED Viewed

@@ -206,12 +206,10 @@ class BnB4BitDiffusersQuantizer(DiffusersQuantizer):
         module._parameters[tensor_name] = new_value
     def check_quantized_param_shape(self, param_name, current_param, loaded_param):
-        import math
         current_param_shape = current_param.shape
         loaded_param_shape = loaded_param.shape
-        n = math.prod(current_param_shape)
         inferred_shape = (n,) if "bias" in param_name else ((n + 1) // 2, 1)
         if loaded_param_shape != inferred_shape:
             raise ValueError(

         module._parameters[tensor_name] = new_value
     def check_quantized_param_shape(self, param_name, current_param, loaded_param):
         current_param_shape = current_param.shape
         loaded_param_shape = loaded_param.shape
+        n = current_param_shape.numel()
         inferred_shape = (n,) if "bias" in param_name else ((n + 1) // 2, 1)
         if loaded_param_shape != inferred_shape:
             raise ValueError(

diffusers_src/src/diffusers/utils/__init__.py CHANGED Viewed

@@ -101,6 +101,7 @@ from .import_utils import (
     is_opencv_available,
     is_optimum_quanto_available,
     is_optimum_quanto_version,
     is_peft_available,
     is_peft_version,
     is_pytorch_retinaface_available,

     is_opencv_available,
     is_optimum_quanto_available,
     is_optimum_quanto_version,
+    is_outlines_available,
     is_peft_available,
     is_peft_version,
     is_pytorch_retinaface_available,

diffusers_src/src/diffusers/utils/import_utils.py CHANGED Viewed

@@ -204,6 +204,7 @@ _wandb_available, _wandb_version = _is_package_available("wandb")
 _tensorboard_available, _tensorboard_version = _is_package_available("tensorboard")
 _compel_available, _compel_version = _is_package_available("compel")
 _sentencepiece_available, _sentencepiece_version = _is_package_available("sentencepiece")
 _torchsde_available, _torchsde_version = _is_package_available("torchsde")
 _peft_available, _peft_version = _is_package_available("peft")
 _torchvision_available, _torchvision_version = _is_package_available("torchvision")
@@ -370,6 +371,10 @@ def is_sentencepiece_available():
     return _sentencepiece_available
 def is_imageio_available():
     return _imageio_available

 _tensorboard_available, _tensorboard_version = _is_package_available("tensorboard")
 _compel_available, _compel_version = _is_package_available("compel")
 _sentencepiece_available, _sentencepiece_version = _is_package_available("sentencepiece")
+_outlines_available, _outlines_version = _is_package_available("outlines")
 _torchsde_available, _torchsde_version = _is_package_available("torchsde")
 _peft_available, _peft_version = _is_package_available("peft")
 _torchvision_available, _torchvision_version = _is_package_available("torchvision")
     return _sentencepiece_available
+def is_outlines_available():  # reports the module-level flag set by `_is_package_available("outlines")`
+    return _outlines_available
 def is_imageio_available():
     return _imageio_available