multimodalart HF Staff committed on
Commit
a1f24dc
·
verified ·
1 Parent(s): 4c728e2

Bundle exactly PR #6 (df4eb9b) as the diffusers source

Browse files
diffusers_src/src/diffusers/__init__.py CHANGED
@@ -476,6 +476,8 @@ else:
476
  "HeliosPyramidModularPipeline",
477
  "HunyuanVideo15AutoBlocks",
478
  "HunyuanVideo15ModularPipeline",
 
 
479
  "LTXAutoBlocks",
480
  "LTXModularPipeline",
481
  "QwenImageAutoBlocks",
@@ -1297,6 +1299,8 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
1297
  HeliosPyramidModularPipeline,
1298
  HunyuanVideo15AutoBlocks,
1299
  HunyuanVideo15ModularPipeline,
 
 
1300
  LTXAutoBlocks,
1301
  LTXModularPipeline,
1302
  QwenImageAutoBlocks,
 
476
  "HeliosPyramidModularPipeline",
477
  "HunyuanVideo15AutoBlocks",
478
  "HunyuanVideo15ModularPipeline",
479
+ "Ideogram4AutoBlocks",
480
+ "Ideogram4ModularPipeline",
481
  "LTXAutoBlocks",
482
  "LTXModularPipeline",
483
  "QwenImageAutoBlocks",
 
1299
  HeliosPyramidModularPipeline,
1300
  HunyuanVideo15AutoBlocks,
1301
  HunyuanVideo15ModularPipeline,
1302
+ Ideogram4AutoBlocks,
1303
+ Ideogram4ModularPipeline,
1304
  LTXAutoBlocks,
1305
  LTXModularPipeline,
1306
  QwenImageAutoBlocks,
diffusers_src/src/diffusers/modular_pipelines/__init__.py CHANGED
@@ -79,6 +79,10 @@ else:
79
  "Flux2KleinModularPipeline",
80
  "Flux2KleinBaseModularPipeline",
81
  ]
 
 
 
 
82
  _import_structure["qwenimage"] = [
83
  "QwenImageAutoBlocks",
84
  "QwenImageModularPipeline",
@@ -142,6 +146,10 @@ if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
142
  HunyuanVideo15AutoBlocks,
143
  HunyuanVideo15ModularPipeline,
144
  )
 
 
 
 
145
  from .ltx import LTXAutoBlocks, LTXModularPipeline
146
  from .modular_pipeline import (
147
  AutoPipelineBlocks,
 
79
  "Flux2KleinModularPipeline",
80
  "Flux2KleinBaseModularPipeline",
81
  ]
82
+ _import_structure["ideogram4"] = [
83
+ "Ideogram4AutoBlocks",
84
+ "Ideogram4ModularPipeline",
85
+ ]
86
  _import_structure["qwenimage"] = [
87
  "QwenImageAutoBlocks",
88
  "QwenImageModularPipeline",
 
146
  HunyuanVideo15AutoBlocks,
147
  HunyuanVideo15ModularPipeline,
148
  )
149
+ from .ideogram4 import (
150
+ Ideogram4AutoBlocks,
151
+ Ideogram4ModularPipeline,
152
+ )
153
  from .ltx import LTXAutoBlocks, LTXModularPipeline
154
  from .modular_pipeline import (
155
  AutoPipelineBlocks,
diffusers_src/src/diffusers/modular_pipelines/ideogram4/__init__.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import TYPE_CHECKING
2
+
3
+ from ...utils import (
4
+ DIFFUSERS_SLOW_IMPORT,
5
+ OptionalDependencyNotAvailable,
6
+ _LazyModule,
7
+ get_objects_from_module,
8
+ is_torch_available,
9
+ is_transformers_available,
10
+ )
11
+
12
+
13
# Placeholder objects exposed when torch/transformers are missing, keyed by public name.
_dummy_objects = {}
# Maps submodule name -> list of public names it provides; handed to `_LazyModule` below.
_import_structure = {}

# The try/raise/except/else shape is used purely as control flow: the `else` branch runs only when both
# optional dependencies are available.
try:
    if not (is_transformers_available() and is_torch_available()):
        raise OptionalDependencyNotAvailable()
except OptionalDependencyNotAvailable:
    from ...utils import dummy_torch_and_transformers_objects  # noqa F403

    # Collect every object of the dummy module so it can be attached to this package further down.
    _dummy_objects.update(get_objects_from_module(dummy_torch_and_transformers_objects))
else:
    _import_structure["modular_blocks_ideogram4"] = ["Ideogram4AutoBlocks"]
    _import_structure["modular_pipeline"] = ["Ideogram4ModularPipeline"]

if TYPE_CHECKING or DIFFUSERS_SLOW_IMPORT:
    # Eager path (static type checking or slow-import mode): import the real symbols, or the dummies when
    # the optional dependencies are missing.
    try:
        if not (is_transformers_available() and is_torch_available()):
            raise OptionalDependencyNotAvailable()
    except OptionalDependencyNotAvailable:
        from ...utils.dummy_torch_and_transformers_objects import *  # noqa F403
    else:
        from .modular_blocks_ideogram4 import Ideogram4AutoBlocks
        from .modular_pipeline import Ideogram4ModularPipeline
else:
    import sys

    # Replace this module in `sys.modules` with a `_LazyModule` built from `_import_structure`
    # (presumably it defers the submodule imports until attribute access -- see `_LazyModule`).
    sys.modules[__name__] = _LazyModule(
        __name__,
        globals()["__file__"],
        _import_structure,
        module_spec=__spec__,
    )

    # Attach the dummy placeholders (empty when the dependencies are installed) to the replaced module.
    for name, value in _dummy_objects.items():
        setattr(sys.modules[__name__], name, value)
diffusers_src/src/diffusers/modular_pipelines/ideogram4/before_denoise.py ADDED
@@ -0,0 +1,558 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import math
17
+
18
+ import torch
19
+
20
+ from ...models.transformers.transformer_ideogram4 import (
21
+ IMAGE_POSITION_OFFSET,
22
+ LLM_TOKEN_INDICATOR,
23
+ OUTPUT_IMAGE_INDICATOR,
24
+ SEQUENCE_PADDING_INDICATOR,
25
+ Ideogram4Transformer2DModel,
26
+ )
27
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
28
+ from ...utils import logging
29
+ from ...utils.torch_utils import randn_tensor
30
+ from ..modular_pipeline import ModularPipelineBlocks, PipelineState
31
+ from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
32
+ from .modular_pipeline import Ideogram4ModularPipeline
33
+
34
+
35
logger = logging.get_logger(__name__)  # pylint: disable=invalid-name

# Default per-step guidance schedule: 48 entries, 7.0 for the first 45 (main) steps, then 3.0 for the final 3
# "polish" steps. The length must equal `num_inference_steps`; `Ideogram4SetTimestepsStep` raises a `ValueError`
# otherwise, so this default is only usable together with that step's default of 48 inference steps.
DEFAULT_GUIDANCE_SCHEDULE = (7.0,) * 45 + (3.0,) * 3
40
+
41
+
42
+ # Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4._logit_normal_sigmas
43
+ def _logit_normal_sigmas(
44
+ num_inference_steps: int,
45
+ mu: float,
46
+ std: float = 1.0,
47
+ logsnr_min: float = -15.0,
48
+ logsnr_max: float = 18.0,
49
+ device: torch.device | None = None,
50
+ ) -> torch.Tensor:
51
+ r"""
52
+ Build a length-`num_inference_steps` sigma schedule using the Ideogram4 logit-normal flow-matching schedule.
53
+
54
+ Sigmas are returned in `[0, 1]` in decreasing order (sigma close to 1 corresponds to pure noise, sigma close to 0
55
+ to clean data), matching diffusers conventions.
56
+
57
+ The Ideogram4 schedule applies `sigma(s) = 1 - logit_normal_cdf_inverse(1 - s)` to `s = linspace(0, 1, N + 1)` and
58
+ keeps the first `N` entries; a terminal zero is appended downstream by the scheduler.
59
+ """
60
+ intervals = torch.linspace(0.0, 1.0, num_inference_steps + 1, dtype=torch.float64)
61
+ # Apply the inverse CDF of a normal then push through the logistic to obtain a logit-normal CDF inverse.
62
+ z = torch.special.ndtri(intervals)
63
+ y = mu + std * z
64
+ t = 1.0 - torch.special.expit(y)
65
+ t_min = 1.0 / (1.0 + math.exp(0.5 * logsnr_max))
66
+ t_max = 1.0 / (1.0 + math.exp(0.5 * logsnr_min))
67
+ t = t.clamp(t_min, t_max)
68
+ # Convert from model time (0 = noise, 1 = data) to diffusers sigma (1 = noise, 0 = data) and reverse.
69
+ sigmas = (1.0 - t).flip(0)
70
+ # Drop the trailing 0; FlowMatchEulerDiscreteScheduler.set_timesteps appends one back internally.
71
+ sigmas = sigmas[:-1].to(dtype=torch.float32, device=device)
72
+ return sigmas
73
+
74
+
75
+ # Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4._resolution_aware_mu
76
def _resolution_aware_mu(
    height: int,
    width: int,
    base_mu: float,
    base_resolution: tuple[int, int] = (512, 512),
) -> float:
    """Shift the schedule mean as a function of image resolution.

    The shift is `0.5 * log(height * width / (base_height * base_width))`, so it is zero at the base resolution
    and grows by `log(2)` every time both sides double.

    Args:
        height (`int`):
            Image height in pixels. Must be positive.
        width (`int`):
            Image width in pixels. Must be positive.
        base_mu (`float`):
            Schedule mean at the base resolution.
        base_resolution (`tuple[int, int]`, *optional*, defaults to `(512, 512)`):
            Reference `(height, width)` at which no shift is applied. Both entries must be positive.

    Returns:
        `float`: The resolution-adjusted schedule mean.

    Raises:
        `ValueError`: If any of the dimensions is not strictly positive.
    """
    if height <= 0 or width <= 0:
        # Checked per axis: two negative values would otherwise multiply to a valid-looking pixel count.
        raise ValueError(f"`height` and `width` must be positive, got height={height}, width={width}.")
    if base_resolution[0] <= 0 or base_resolution[1] <= 0:
        raise ValueError(f"`base_resolution` entries must be positive, got {tuple(base_resolution)}.")

    num_pixels = height * width
    base_pixels = base_resolution[0] * base_resolution[1]
    return base_mu + 0.5 * math.log(num_pixels / base_pixels)
86
+
87
+
88
+ # Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4._expand_tensor_to_effective_batch
89
+ def _expand_tensor_to_effective_batch(
90
+ tensor: torch.Tensor,
91
+ batch_size: int,
92
+ num_per_prompt: int,
93
+ tensor_name: str | None = None,
94
+ ) -> torch.Tensor:
95
+ """Replicate `tensor` along dim 0 from `batch_size` (or 1) to `batch_size * num_per_prompt`."""
96
+ target_batch_size = batch_size * num_per_prompt
97
+
98
+ if tensor.shape[0] == target_batch_size:
99
+ return tensor
100
+
101
+ if tensor.shape[0] == 1:
102
+ repeat_by = target_batch_size
103
+ elif tensor.shape[0] == batch_size:
104
+ repeat_by = num_per_prompt
105
+ else:
106
+ tensor_name = f"`{tensor_name}`" if tensor_name is not None else "Tensor"
107
+ raise ValueError(
108
+ f"{tensor_name} batch size must be 1, `batch_size` ({batch_size}), or "
109
+ f"`batch_size * num_*_per_prompt` ({target_batch_size}), but got {tensor.shape[0]}."
110
+ )
111
+
112
+ return torch.repeat_interleave(tensor, repeats=repeat_by, dim=0, output_size=tensor.shape[0] * repeat_by)
113
+
114
+
115
+ # auto_docstring
116
class Ideogram4TextInputsStep(ModularPipelineBlocks):
    """
    Input step that determines `batch_size`/`dtype` from the per-prompt `text_features` and replicates the text outputs
    to `batch_size * num_images_per_prompt`. Place after the text encoder.

      Inputs:
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          text_features (`Tensor`):
              Per-prompt text features from the encoder.
          text_lengths (`list`):
              Per-prompt text-token counts from the encoder.

      Outputs:
          batch_size (`int`):
              Effective batch size (num prompts * num_images_per_prompt).
          dtype (`dtype`):
              The dtype of the text features.
          text_features (`Tensor`):
              Text features, batch-expanded.
          text_lengths (`list`):
              Text-token counts, batch-expanded.
    """

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Input step that determines `batch_size`/`dtype` from the per-prompt `text_features` and replicates the "
            "text outputs to `batch_size * num_images_per_prompt`. Place after the text encoder."
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("num_images_per_prompt", default=1),
            InputParam(
                name="text_features",
                required=True,
                type_hint=torch.Tensor,
                description="Per-prompt text features from the encoder.",
            ),
            InputParam(
                name="text_lengths",
                required=True,
                type_hint=list,
                description="Per-prompt text-token counts from the encoder.",
            ),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                name="batch_size",
                type_hint=int,
                description="Effective batch size (num prompts * num_images_per_prompt).",
            ),
            OutputParam(name="dtype", type_hint=torch.dtype, description="The dtype of the text features."),
            OutputParam(name="text_features", type_hint=torch.Tensor, description="Text features, batch-expanded."),
            OutputParam(name="text_lengths", type_hint=list, description="Text-token counts, batch-expanded."),
        ]

    @torch.no_grad()
    def __call__(
        self, components: Ideogram4ModularPipeline, state: PipelineState
    ) -> tuple[Ideogram4ModularPipeline, PipelineState]:
        """Derive `batch_size`/`dtype` and expand the text outputs to the effective batch.

        Returns the `(components, state)` pair with the block outputs written back into `state`.

        Raises:
            `ValueError`: If `text_lengths` does not hold exactly one entry per row of `text_features`.
        """
        block_state = self.get_block_state(state)

        # The number of prompts is read off the leading dimension of the text features.
        prompt_batch = block_state.text_features.shape[0]
        num_per_prompt = block_state.num_images_per_prompt

        # Both inputs are per prompt; a mismatch would only surface later as mis-shaped ids/embeddings.
        if len(block_state.text_lengths) != prompt_batch:
            raise ValueError(
                f"`text_lengths` must have one entry per prompt ({prompt_batch}), "
                f"got {len(block_state.text_lengths)}."
            )

        block_state.dtype = block_state.text_features.dtype
        block_state.text_features = _expand_tensor_to_effective_batch(
            block_state.text_features, prompt_batch, num_per_prompt, "text_features"
        )
        # Repeat each length `num_per_prompt` times in place, keeping copies of a prompt adjacent.
        block_state.text_lengths = [n for n in block_state.text_lengths for _ in range(num_per_prompt)]
        block_state.batch_size = prompt_batch * num_per_prompt

        self.set_block_state(state, block_state)
        return components, state
196
+
197
+
198
+ # auto_docstring
199
class Ideogram4PrepareLatentsStep(ModularPipelineBlocks):
    """
    Step that prepares the packed image latents (B, num_image_tokens, latent_dim) for the denoising loop.

      Components:
          transformer (`Ideogram4Transformer2DModel`)

      Inputs:
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          batch_size (`int`):
              Effective batch size.

      Outputs:
          latents (`Tensor`):
              The initial packed image latents (B, num_image_tokens, latent_dim).
          num_image_tokens (`int`):
              Number of image tokens (grid_h * grid_w).
    """

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return "Step that prepares the packed image latents (B, num_image_tokens, latent_dim) for the denoising loop."

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [ComponentSpec("transformer", Ideogram4Transformer2DModel)]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("latents"),
            InputParam.template("height", required=True),
            InputParam.template("width", required=True),
            InputParam.template("generator"),
            InputParam(name="batch_size", required=True, type_hint=int, description="Effective batch size."),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                name="latents",
                type_hint=torch.Tensor,
                description="The initial packed image latents (B, num_image_tokens, latent_dim).",
            ),
            OutputParam(
                name="num_image_tokens", type_hint=int, description="Number of image tokens (grid_h * grid_w)."
            ),
        ]

    @torch.no_grad()
    def __call__(
        self, components: Ideogram4ModularPipeline, state: PipelineState
    ) -> tuple[Ideogram4ModularPipeline, PipelineState]:
        """Sample (or validate and cast) the packed latents and record the image-token count.

        Returns the `(components, state)` pair with the block outputs written back into `state`.

        Raises:
            `ValueError`: If user-provided `latents` do not have shape `(batch_size, num_image_tokens, latent_dim)`.
        """
        block_state = self.get_block_state(state)

        device = components._execution_device
        # Height/width are divided by `vae_scale_factor * patch_size` to obtain the image-token grid.
        pixels_per_token = components.vae_scale_factor * components.patch_size
        if block_state.height % pixels_per_token != 0 or block_state.width % pixels_per_token != 0:
            # Floor division below drops the remainder, so the token grid covers less than was requested.
            logger.warning(
                f"`height` ({block_state.height}) and `width` ({block_state.width}) should be divisible by "
                f"{pixels_per_token}; the token grid is rounded down."
            )
        grid_h = block_state.height // pixels_per_token
        grid_w = block_state.width // pixels_per_token
        num_image_tokens = grid_h * grid_w
        latent_dim = components.transformer.config.in_channels

        shape = (block_state.batch_size, num_image_tokens, latent_dim)
        if block_state.latents is None:
            block_state.latents = randn_tensor(
                shape, generator=block_state.generator, device=device, dtype=torch.float32
            )
        else:
            # Provided latents are used as-is (only moved/cast), so they must already be in the packed layout.
            if tuple(block_state.latents.shape) != shape:
                raise ValueError(
                    f"Unexpected latents shape, got {tuple(block_state.latents.shape)}, expected {shape}."
                )
            block_state.latents = block_state.latents.to(device=device, dtype=torch.float32)

        block_state.num_image_tokens = num_image_tokens

        self.set_block_state(state, block_state)
        return components, state
281
+
282
+
283
+ # auto_docstring
284
class Ideogram4SetTimestepsStep(ModularPipelineBlocks):
    """
    Step that sets the resolution-aware logit-normal sigma schedule on the scheduler and resolves the per-step guidance
    weights.

      Components:
          scheduler (`FlowMatchEulerDiscreteScheduler`)

      Inputs:
          num_inference_steps (`int`, *optional*, defaults to 48):
              The number of denoising steps.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          mu (`float`, *optional*, defaults to 0.0):
              Base mean of the logit-normal schedule.
          std (`float`, *optional*, defaults to 1.5):
              Std of the logit-normal schedule.
          guidance_schedule (`list`, *optional*, defaults to (7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 3.0, 3.0, 3.0)):
              Per-step guidance scale schedule (length num_inference_steps).

      Outputs:
          timesteps (`Tensor`):
              The denoising timesteps.
          gw (`Tensor`):
              Per-step guidance weights (num_inference_steps,).
    """

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Step that sets the resolution-aware logit-normal sigma schedule on the scheduler and resolves the "
            "per-step guidance weights."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("num_inference_steps", default=48),
            InputParam.template("height", required=True),
            InputParam.template("width", required=True),
            InputParam(name="mu", default=0.0, type_hint=float, description="Base mean of the logit-normal schedule."),
            InputParam(name="std", default=1.5, type_hint=float, description="Std of the logit-normal schedule."),
            InputParam(
                name="guidance_schedule",
                default=DEFAULT_GUIDANCE_SCHEDULE,
                type_hint=list,
                description="Per-step guidance scale schedule (length num_inference_steps).",
            ),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(name="timesteps", type_hint=torch.Tensor, description="The denoising timesteps."),
            OutputParam(
                name="gw", type_hint=torch.Tensor, description="Per-step guidance weights (num_inference_steps,)."
            ),
        ]

    @torch.no_grad()
    def __call__(
        self, components: Ideogram4ModularPipeline, state: PipelineState
    ) -> tuple[Ideogram4ModularPipeline, PipelineState]:
        """Configure the scheduler with the resolution-aware sigmas and tensorize the guidance schedule.

        Returns the `(components, state)` pair with `timesteps` and `gw` written back into `state`.

        Raises:
            `ValueError`: If `guidance_schedule` does not have exactly `num_inference_steps` entries.
        """
        block_state = self.get_block_state(state)

        device = components._execution_device
        num_steps = block_state.num_inference_steps
        # One guidance weight is consumed per denoising step, so the lengths must agree.
        if len(block_state.guidance_schedule) != num_steps:
            raise ValueError(
                f"`guidance_schedule` must have length `num_inference_steps` ({num_steps}), "
                f"got {len(block_state.guidance_schedule)}."
            )

        schedule_mu = _resolution_aware_mu(height=block_state.height, width=block_state.width, base_mu=block_state.mu)
        # The sigmas are handed to the scheduler as a plain Python list, so they are built on the CPU; placing
        # them on `device` first would only add a device -> host copy in `tolist()`.
        sigmas = _logit_normal_sigmas(num_steps, schedule_mu, std=block_state.std)
        components.scheduler.set_timesteps(sigmas=sigmas.tolist(), device=device)

        block_state.timesteps = components.scheduler.timesteps
        block_state.gw = torch.as_tensor(block_state.guidance_schedule, dtype=torch.float32, device=device)

        self.set_block_state(state, block_state)
        return components, state
373
+
374
+
375
+ # auto_docstring
376
class Ideogram4PrepareAdditionalInputsStep(ModularPipelineBlocks):
    """
    Step that prepares the additional denoiser inputs from the packed-sequence layout: the conditional
    encoder_hidden_states (text features packed with image padding) and the position_ids/segment_ids/indicator, plus
    the unconditional (image-only) counterparts. Place after prepare_latents.

      Inputs:
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          text_features (`Tensor`):
              Batch-expanded text features.
          text_lengths (`list`):
              Batch-expanded text-token counts.
          batch_size (`int`):
              Effective batch size.

      Outputs:
          prompt_embeds (`Tensor`):
              Packed conditional encoder_hidden_states (B, total_seq, dim).
          position_ids (`Tensor`):
              Conditional 3-axis MRoPE position ids.
          segment_ids (`Tensor`):
              Conditional block-diagonal segment ids.
          indicator (`Tensor`):
              Conditional per-token text/image/pad role.
          negative_prompt_embeds (`Tensor`):
              Unconditional (zeroed) text features (B, num_image_tokens, dim).
          negative_position_ids (`Tensor`):
              Unconditional position ids (image region).
          negative_segment_ids (`Tensor`):
              Unconditional segment ids (image region).
          negative_indicator (`Tensor`):
              Unconditional indicator (image region).
    """

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Step that prepares the additional denoiser inputs from the packed-sequence layout: the conditional "
            "encoder_hidden_states (text features packed with image padding) and the position_ids/segment_ids/"
            "indicator, plus the unconditional (image-only) counterparts. Place after prepare_latents."
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("height", required=True),
            InputParam.template("width", required=True),
            InputParam(
                name="text_features",
                required=True,
                type_hint=torch.Tensor,
                description="Batch-expanded text features.",
            ),
            InputParam(
                name="text_lengths", required=True, type_hint=list, description="Batch-expanded text-token counts."
            ),
            InputParam(name="batch_size", required=True, type_hint=int, description="Effective batch size."),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                name="prompt_embeds",
                type_hint=torch.Tensor,
                description="Packed conditional encoder_hidden_states (B, total_seq, dim).",
            ),
            OutputParam(
                name="position_ids", type_hint=torch.Tensor, description="Conditional 3-axis MRoPE position ids."
            ),
            OutputParam(
                name="segment_ids", type_hint=torch.Tensor, description="Conditional block-diagonal segment ids."
            ),
            OutputParam(
                name="indicator", type_hint=torch.Tensor, description="Conditional per-token text/image/pad role."
            ),
            OutputParam(
                name="negative_prompt_embeds",
                type_hint=torch.Tensor,
                description="Unconditional (zeroed) text features (B, num_image_tokens, dim).",
            ),
            OutputParam(
                name="negative_position_ids",
                type_hint=torch.Tensor,
                description="Unconditional position ids (image region).",
            ),
            OutputParam(
                name="negative_segment_ids",
                type_hint=torch.Tensor,
                description="Unconditional segment ids (image region).",
            ),
            OutputParam(
                name="negative_indicator",
                type_hint=torch.Tensor,
                description="Unconditional indicator (image region).",
            ),
        ]

    @staticmethod
    # Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4.Ideogram4Pipeline._prepare_ids
    def _prepare_ids(
        text_lengths: list[int],
        grid_h: int,
        grid_w: int,
        max_text_tokens: int,
        device: torch.device,
    ) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Build the packed `[left-pad][text][image]` layout from the per-prompt text lengths and the image grid.

        Returns `position_ids` (3-axis MRoPE), `segment_ids` (block-diagonal attention) and `indicator` (per-token
        text/image/pad role).

        Raises:
            `ValueError`: If a text length is negative or larger than `max_text_tokens`.
        """
        batch_size = len(text_lengths)
        num_image_tokens = grid_h * grid_w
        total_seq_len = max_text_tokens + num_image_tokens

        # Image position ids (t=0, h, w); offset keeps them disjoint from text positions.
        h_idx = torch.arange(grid_h).view(-1, 1).expand(grid_h, grid_w).reshape(-1)
        w_idx = torch.arange(grid_w).view(1, -1).expand(grid_h, grid_w).reshape(-1)
        t_idx = torch.zeros_like(h_idx)
        image_pos = torch.stack([t_idx, h_idx, w_idx], dim=1) + IMAGE_POSITION_OFFSET

        # Everything starts out as padding; the loop below overwrites the text and image regions per sample.
        position_ids = torch.zeros(batch_size, total_seq_len, 3, dtype=torch.long)
        segment_ids = torch.full((batch_size, total_seq_len), SEQUENCE_PADDING_INDICATOR, dtype=torch.long)
        indicator = torch.zeros(batch_size, total_seq_len, dtype=torch.long)

        for b, num_text in enumerate(text_lengths):
            # A length outside [0, max_text_tokens] would make `offset` negative (or exceed the text region) and
            # the slices below would silently address the wrong tokens.
            if not 0 <= num_text <= max_text_tokens:
                raise ValueError(
                    f"Each entry of `text_lengths` must be in [0, {max_text_tokens}], "
                    f"got {num_text} at index {b}."
                )
            # Text is right-aligned inside the text region, i.e. the padding sits on the left.
            offset = max_text_tokens - num_text

            text_pos = torch.arange(num_text)
            text_pos_3d = torch.stack([text_pos, text_pos, text_pos], dim=1)
            position_ids[b, offset : offset + num_text] = text_pos_3d
            position_ids[b, offset + num_text :] = image_pos

            indicator[b, offset : offset + num_text] = LLM_TOKEN_INDICATOR
            indicator[b, offset + num_text :] = OUTPUT_IMAGE_INDICATOR

            # Text and image tokens of a sample share segment 1; the left padding keeps the padding indicator.
            segment_ids[b, offset : offset + num_text + num_image_tokens] = 1

        return position_ids.to(device), segment_ids.to(device), indicator.to(device)

    @torch.no_grad()
    def __call__(
        self, components: Ideogram4ModularPipeline, state: PipelineState
    ) -> tuple[Ideogram4ModularPipeline, PipelineState]:
        """Pack the conditional inputs and derive the unconditional (image-only) counterparts.

        Returns the `(components, state)` pair with the block outputs written back into `state`.

        Raises:
            `ValueError`: If `text_lengths` does not hold exactly `batch_size` entries, or (from `_prepare_ids`)
                if a text length lies outside `[0, text_features.shape[1]]`.
        """
        block_state = self.get_block_state(state)

        device = components._execution_device
        patch = components.patch_size
        grid_h = block_state.height // (components.vae_scale_factor * patch)
        grid_w = block_state.width // (components.vae_scale_factor * patch)
        num_image_tokens = grid_h * grid_w

        text_features = block_state.text_features
        max_text_tokens = text_features.shape[1]
        feature_dim = text_features.shape[-1]

        # The ids are sized from `len(text_lengths)` while the embeddings are sized from `batch_size`; reject a
        # mismatch here instead of producing ids and embeddings with different batch dimensions.
        if len(block_state.text_lengths) != block_state.batch_size:
            raise ValueError(
                f"`text_lengths` must have `batch_size` ({block_state.batch_size}) entries, "
                f"got {len(block_state.text_lengths)}."
            )

        position_ids, segment_ids, indicator = self._prepare_ids(
            block_state.text_lengths, grid_h, grid_w, max_text_tokens, device
        )

        # Pack the text features into the full sequence; image positions carry no text features.
        image_feature_padding = torch.zeros(
            block_state.batch_size, num_image_tokens, feature_dim, dtype=text_features.dtype, device=device
        )
        block_state.prompt_embeds = torch.cat([text_features, image_feature_padding], dim=1)

        # Unconditional (image-only) branch, derived from the conditioning.
        block_state.negative_prompt_embeds = torch.zeros(
            block_state.batch_size, num_image_tokens, feature_dim, dtype=text_features.dtype, device=device
        )
        block_state.position_ids = position_ids
        block_state.segment_ids = segment_ids
        block_state.indicator = indicator
        # The image tokens occupy the tail of every packed sequence, so slicing off the first
        # `max_text_tokens` positions yields the image-only ids.
        block_state.negative_position_ids = position_ids[:, max_text_tokens:]
        block_state.negative_segment_ids = segment_ids[:, max_text_tokens:]
        block_state.negative_indicator = indicator[:, max_text_tokens:]

        self.set_block_state(state, block_state)
        return components, state
diffusers_src/src/diffusers/modular_pipelines/ideogram4/decoders.py ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+
18
+ from ...configuration_utils import FrozenDict
19
+ from ...image_processor import VaeImageProcessor
20
+ from ...models import AutoencoderKLFlux2
21
+ from ...utils import logging
22
+ from ..modular_pipeline import ModularPipelineBlocks, PipelineState
23
+ from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
24
+ from .modular_pipeline import Ideogram4ModularPipeline
25
+
26
+
27
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
28
+
29
+
30
+ # auto_docstring
31
class Ideogram4DecodeStep(ModularPipelineBlocks):
    """
    Step that decodes the unpatchified (B, ae_channels, H, W) latents into images: de-normalizes with the VAE
    batch-norm statistics and decodes through the VAE.

      Components:
          vae (`AutoencoderKLFlux2`) image_processor (`VaeImageProcessor`)

      Inputs:
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.
          latents (`Tensor`):
              The unpatchified (B, ae_channels, H, W) latents to decode, from the after-denoise step.

      Outputs:
          images (`list`):
              Generated images.
    """

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Step that decodes the unpatchified (B, ae_channels, H, W) latents into images: de-normalizes with the "
            "VAE batch-norm statistics and decodes through the VAE."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("vae", AutoencoderKLFlux2),
            ComponentSpec(
                "image_processor",
                VaeImageProcessor,
                config=FrozenDict({"vae_scale_factor": 16}),
                default_creation_method="from_config",
            ),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("output_type", default="pil"),
            InputParam(
                name="latents",
                required=True,
                type_hint=torch.Tensor,
                description="The unpatchified (B, ae_channels, H, W) latents to decode, from the after-denoise step.",
            ),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [OutputParam.template("images")]

    @staticmethod
    def _tile_bn_stat(
        stat: torch.Tensor, patch: int, ae_channels: int, grid_h: int, grid_w: int, like: torch.Tensor
    ) -> torch.Tensor:
        """Turn a flat batch-norm statistic into an (ae_channels, grid_h * patch, grid_w * patch) map.

        The statistic is moved to the device/dtype of `like` while it is still small, i.e. before it is tiled.
        """
        tile = stat.to(device=like.device, dtype=like.dtype).view(patch, patch, ae_channels).permute(2, 0, 1)
        return tile.repeat(1, grid_h, grid_w)

    @torch.no_grad()
    def __call__(
        self, components: Ideogram4ModularPipeline, state: PipelineState
    ) -> tuple[Ideogram4ModularPipeline, PipelineState]:
        """De-normalize the latents, decode them with the VAE and post-process into `output_type`.

        Returns the `(components, state)` pair with `images` written back into `state`.
        """
        block_state = self.get_block_state(state)

        z = block_state.latents
        patch = components.patch_size
        ae_channels = z.shape[1]
        grid_h, grid_w = z.shape[2] // patch, z.shape[3] // patch

        # VAE bn stores per-channel statistics over the packed channels, laid out as (patch_row, patch_col,
        # ae_channel). Reshape them into an (ae_channels, patch, patch) tile and repeat across the grid so the
        # denormalization on the unpatchified latents matches the packed-space statistics.
        bn_std = torch.sqrt(components.vae.bn.running_var + components.vae.config.batch_norm_eps)
        bn_mean = self._tile_bn_stat(components.vae.bn.running_mean, patch, ae_channels, grid_h, grid_w, z)
        bn_std = self._tile_bn_stat(bn_std, patch, ae_channels, grid_h, grid_w, z)
        z = z * bn_std + bn_mean

        # Decode in the VAE's own dtype, then post-process in float32.
        decoded = components.vae.decode(z.to(components.vae.dtype), return_dict=False)[0]
        block_state.images = components.image_processor.postprocess(
            decoded.float(), output_type=block_state.output_type
        )

        self.set_block_state(state, block_state)
        return components, state
diffusers_src/src/diffusers/modular_pipelines/ideogram4/denoise.py ADDED
@@ -0,0 +1,363 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+
18
+ from ...models.transformers.transformer_ideogram4 import Ideogram4Transformer2DModel
19
+ from ...schedulers import FlowMatchEulerDiscreteScheduler
20
+ from ...utils import logging
21
+ from ..modular_pipeline import (
22
+ BlockState,
23
+ LoopSequentialPipelineBlocks,
24
+ ModularPipelineBlocks,
25
+ PipelineState,
26
+ )
27
+ from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
28
+ from .modular_pipeline import Ideogram4ModularPipeline
29
+
30
+
31
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
32
+
33
+
34
class Ideogram4LoopBeforeDenoiser(ModularPipelineBlocks):
    """Loop sub-block that prepares, for one step, the conditional packed input and the model-time value."""

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Within the denoising loop: build the conditional packed input `[text-padding][image latents]` "
            "and the model timestep. Compose into the `sub_blocks` of `Ideogram4DenoiseLoopWrapper`."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def inputs(self) -> list[InputParam]:
        latents_param = InputParam(
            name="latents", required=True, type_hint=torch.Tensor, description="Packed image latents."
        )
        position_ids_param = InputParam(
            name="position_ids", required=True, type_hint=torch.Tensor, description="Conditional position ids."
        )
        batch_size_param = InputParam(
            name="batch_size", required=True, type_hint=int, description="Effective batch size."
        )
        return [latents_param, position_ids_param, batch_size_param]

    @torch.no_grad()
    def __call__(self, components: Ideogram4ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Write `pos_z`, `max_text_tokens` and `t_model` onto `block_state` for the current step."""
        image_latents = block_state.latents

        # The conditional sequence is [text-padding][image latents], so whatever part of the position ids is
        # not covered by image tokens is the length of the text region.
        num_text_tokens = block_state.position_ids.shape[1] - image_latents.shape[1]
        padding_shape = (image_latents.shape[0], num_text_tokens, image_latents.shape[-1])
        zero_text_slots = torch.zeros(padding_shape, dtype=image_latents.dtype, device=image_latents.device)

        block_state.pos_z = torch.cat([zero_text_slots, image_latents], dim=1)
        block_state.max_text_tokens = num_text_tokens

        # Convert the scheduler timestep into model time in [0, 1], where 0 is noise and 1 is clean data.
        train_steps = components.scheduler.config.num_train_timesteps
        model_time = 1.0 - (t.float() / train_steps)
        block_state.t_model = model_time.expand(block_state.batch_size)

        return components, block_state
77
+
78
+
79
class Ideogram4LoopDenoiser(ModularPipelineBlocks):
    """Loop sub-block that runs both transformers for one step and blends their velocities into `noise_pred`."""

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Within the denoising loop: run the conditional `transformer` on the full packed sequence "
            "and the `unconditional_transformer` on the image-only sequence, then blend with the per-step "
            "guidance weight (asymmetric CFG, no guider). Compose into `Ideogram4DenoiseLoopWrapper`."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        component_names = ("transformer", "unconditional_transformer")
        return [ComponentSpec(name, Ideogram4Transformer2DModel) for name in component_names]

    @property
    def inputs(self) -> list[InputParam]:
        # Every input of this block is a required tensor, so only the name and description vary.
        tensor_inputs = (
            ("prompt_embeds", "Packed conditional encoder_hidden_states."),
            ("position_ids", "Conditional 3-axis MRoPE position ids."),
            ("segment_ids", "Conditional block-diagonal segment ids."),
            ("indicator", "Conditional per-token text/image/pad role."),
            ("negative_prompt_embeds", "Unconditional (zeroed) text features."),
            ("negative_position_ids", "Unconditional position ids (image region)."),
            ("negative_segment_ids", "Unconditional segment ids (image region)."),
            ("negative_indicator", "Unconditional indicator (image region)."),
            ("gw", "Per-step guidance weights."),
            ("latents", "Packed image latents."),
        )
        return [
            InputParam(name=name, required=True, type_hint=torch.Tensor, description=text)
            for name, text in tensor_inputs
        ]

    @torch.no_grad()
    def __call__(self, components: Ideogram4ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Compute the guided velocity for step `i` and store its negation as `block_state.noise_pred`."""
        cond_model = components.transformer
        uncond_model = components.unconditional_transformer

        def _predict(model, hidden_states, text_states, position_ids, segment_ids, indicator):
            # Floating-point inputs are cast to the model's dtype; the id tensors are passed through unchanged.
            return model(
                hidden_states=hidden_states.to(model.dtype),
                timestep=block_state.t_model.to(model.dtype),
                encoder_hidden_states=text_states.to(model.dtype),
                position_ids=position_ids,
                segment_ids=segment_ids,
                indicator=indicator,
                return_dict=False,
            )[0]

        # Conditional pass over the full packed sequence; only the image-token region carries the velocity.
        cond_out = _predict(
            cond_model,
            block_state.pos_z,
            block_state.prompt_embeds,
            block_state.position_ids,
            block_state.segment_ids,
            block_state.indicator,
        )
        cond_velocity = cond_out[:, block_state.max_text_tokens :].to(torch.float32)

        # Unconditional pass over the image-only sequence with the zeroed text features.
        uncond_velocity = _predict(
            uncond_model,
            block_state.latents,
            block_state.negative_prompt_embeds,
            block_state.negative_position_ids,
            block_state.negative_segment_ids,
            block_state.negative_indicator,
        ).to(torch.float32)

        weight = block_state.gw[i]
        velocity = weight * cond_velocity + (1.0 - weight) * uncond_velocity
        # The scheduler integrates `-v` (Ideogram predicts velocity v = x0 - noise).
        block_state.noise_pred = -velocity
        return components, block_state
185
+
186
+
187
class Ideogram4LoopAfterDenoiser(ModularPipelineBlocks):
    """Loop sub-block that advances the latents by one scheduler step using `block_state.noise_pred`."""

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return "Within the denoising loop: scheduler step. Compose into `Ideogram4DenoiseLoopWrapper`."

    @property
    def expected_components(self) -> list[ComponentSpec]:
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        latents_param = OutputParam(name="latents", type_hint=torch.Tensor, description="The denoised latents.")
        return [latents_param]

    @torch.no_grad()
    def __call__(self, components: Ideogram4ModularPipeline, block_state: BlockState, i: int, t: torch.Tensor):
        """Replace `block_state.latents` with the first element of the scheduler's step result for timestep `t`."""
        scheduler = components.scheduler
        step_result = scheduler.step(block_state.noise_pred, t, block_state.latents, return_dict=False)
        block_state.latents = step_result[0]
        return components, block_state
208
+
209
+
210
+ # auto_docstring
211
class Ideogram4DenoiseStep(LoopSequentialPipelineBlocks):
    """
    Denoising loop that iteratively denoises the packed image latents over `timesteps`, running both the conditional
    and unconditional transformers and blending with the per-step guidance schedule.

    Components:
        scheduler (`FlowMatchEulerDiscreteScheduler`) transformer (`Ideogram4Transformer2DModel`)
        unconditional_transformer (`Ideogram4Transformer2DModel`)

    Inputs:
        timesteps (`Tensor`):
            Denoising timesteps from set_timesteps.
        num_inference_steps (`int`, *optional*, defaults to 48):
            The number of denoising steps.
        latents (`Tensor`):
            Packed image latents.
        position_ids (`Tensor`):
            Conditional position ids.
        batch_size (`int`):
            Effective batch size.
        prompt_embeds (`Tensor`):
            Packed conditional encoder_hidden_states.
        position_ids (`Tensor`):
            Conditional 3-axis MRoPE position ids.
        segment_ids (`Tensor`):
            Conditional block-diagonal segment ids.
        indicator (`Tensor`):
            Conditional per-token text/image/pad role.
        negative_prompt_embeds (`Tensor`):
            Unconditional (zeroed) text features.
        negative_position_ids (`Tensor`):
            Unconditional position ids (image region).
        negative_segment_ids (`Tensor`):
            Unconditional segment ids (image region).
        negative_indicator (`Tensor`):
            Unconditional indicator (image region).
        gw (`Tensor`):
            Per-step guidance weights.

    Outputs:
        latents (`Tensor`):
            The denoised latents.
    """

    model_name = "ideogram4"
    # The sub-blocks run in this order on every iteration; `block_names` labels them one-to-one.
    block_classes = [
        Ideogram4LoopBeforeDenoiser,
        Ideogram4LoopDenoiser,
        Ideogram4LoopAfterDenoiser,
    ]
    block_names = [
        "before_denoiser",
        "denoiser",
        "after_denoiser",
    ]

    @property
    def description(self) -> str:
        return (
            "Denoising loop that iteratively denoises the packed image latents over `timesteps`, "
            "running both the conditional and unconditional transformers and blending with the "
            "per-step guidance schedule."
        )

    @property
    def loop_expected_components(self) -> list[ComponentSpec]:
        scheduler_spec = ComponentSpec("scheduler", FlowMatchEulerDiscreteScheduler)
        return [scheduler_spec]

    @property
    def loop_inputs(self) -> list[InputParam]:
        timesteps_param = InputParam(
            name="timesteps",
            required=True,
            type_hint=torch.Tensor,
            description="Denoising timesteps from set_timesteps.",
        )
        num_steps_param = InputParam.template("num_inference_steps", default=48)
        return [timesteps_param, num_steps_param]

    @torch.no_grad()
    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
        """Run `loop_step` once per entry of `timesteps`, updating the progress bar after each one."""
        block_state = self.get_block_state(state)

        # The bar length comes from `num_inference_steps`; the iteration itself follows `timesteps`.
        with self.progress_bar(total=block_state.num_inference_steps) as bar:
            for step_index, timestep in enumerate(block_state.timesteps):
                components, block_state = self.loop_step(components, block_state, i=step_index, t=timestep)
                bar.update()

        self.set_block_state(state, block_state)
        return components, state
293
+
294
+
295
+ # auto_docstring
296
class Ideogram4AfterDenoiseStep(ModularPipelineBlocks):
    """
    Step that runs after the denoising loop: unpatchifies the packed image latents (B, num_image_tokens, ae_channels *
    patch ** 2) into a (B, ae_channels, H, W) latent for the decoder.

    Inputs:
        height (`int`):
            The height in pixels of the generated image.
        width (`int`):
            The width in pixels of the generated image.
        latents (`Tensor`):
            The denoised packed image latents (B, num_image_tokens, latent_dim).

    Outputs:
        latents (`Tensor`):
            Unpatchified latents (B, ae_channels, H, W) ready for the VAE decoder.
    """

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Step that runs after the denoising loop: unpatchifies the packed image latents "
            "(B, num_image_tokens, ae_channels * patch ** 2) into a (B, ae_channels, H, W) latent for the decoder."
        )

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("height", required=True),
            InputParam.template("width", required=True),
            InputParam(
                name="latents",
                required=True,
                type_hint=torch.Tensor,
                description="The denoised packed image latents (B, num_image_tokens, latent_dim).",
            ),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                name="latents",
                type_hint=torch.Tensor,
                description="Unpatchified latents (B, ae_channels, H, W) ready for the VAE decoder.",
            )
        ]

    @torch.no_grad()
    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
        """Unpack `latents` from (B, tokens, latent_dim) into (B, ae_channels, grid_h * patch, grid_w * patch).

        Raises:
            ValueError: If the packed latents are not 3-D, if their token count does not equal the
                `grid_h * grid_w` implied by `height`/`width`, or if their last dimension is not a multiple
                of `patch ** 2`.
        """
        block_state = self.get_block_state(state)

        z = block_state.latents
        patch = components.patch_size
        # Each packed token spans `vae_scale_factor * patch` pixels along each image axis.
        pixels_per_token = components.vae_scale_factor * patch
        grid_h = block_state.height // pixels_per_token
        grid_w = block_state.width // pixels_per_token

        # Fail with an actionable message instead of the opaque shape error the reshape would raise.
        if z.ndim != 3 or z.shape[1] != grid_h * grid_w or z.shape[-1] % (patch * patch) != 0:
            raise ValueError(
                f"Cannot unpatchify latents of shape {tuple(z.shape)}: expected (batch, {grid_h * grid_w}, "
                f"ae_channels * {patch * patch}) for height={block_state.height}, width={block_state.width}."
            )

        ae_channels = z.shape[-1] // (patch * patch)
        # `reshape` (rather than `view`) also accepts non-contiguous latents.
        z = z.reshape(z.shape[0], grid_h, grid_w, patch, patch, ae_channels)
        # (B, grid_h, grid_w, patch_row, patch_col, C) -> (B, C, grid_h, patch_row, grid_w, patch_col), then
        # merge each (grid, patch) pair into one spatial axis.
        z = z.permute(0, 5, 1, 3, 2, 4).reshape(z.shape[0], ae_channels, grid_h * patch, grid_w * patch)

        block_state.latents = z

        self.set_block_state(state, block_state)
        return components, state
diffusers_src/src/diffusers/modular_pipelines/ideogram4/encoders.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ import torch
17
+ from transformers import Qwen2Tokenizer, Qwen3VLModel
18
+ from transformers.masking_utils import create_causal_mask
19
+
20
+ from ...pipelines.ideogram4.prompt_enhancer import (
21
+ DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
22
+ PROMPT_UPSAMPLE_TEMPERATURE,
23
+ generate_captions,
24
+ graft_lm_head,
25
+ )
26
+ from ...utils import logging
27
+ from ..modular_pipeline import ModularPipelineBlocks, PipelineState
28
+ from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
29
+ from .modular_pipeline import Ideogram4ModularPipeline
30
+
31
+
32
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
33
+
34
+
35
# Hidden states of these Qwen3-VL decoder layers are concatenated to form the per-token
# text conditioning consumed by the Ideogram4 transformer. The indices are 0-based positions in
# `language_model.layers`, and the tapped states are returned in the order listed here.
QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
38
+
39
+
40
+ # auto_docstring
41
class Ideogram4PromptUpsampleStep(ModularPipelineBlocks):
    """
    Optional step that rewrites the prompt(s) into Ideogram4's native structured JSON caption (the format the model
    is trained on) when ``prompt_upsampling=True``. On first use it grafts a hosted LM head onto the (head-less)
    text encoder to make it generative; install ``outlines`` for schema-constrained captions.

    Components:
        text_encoder (`Qwen3VLModel`): The Qwen3-VL text encoder. tokenizer (`Qwen2Tokenizer`): The tokenizer
        paired with the text encoder.

    Inputs:
        prompt (`str`):
            The prompt or prompts to guide image generation.
        prompt_upsampling (`bool`, *optional*, defaults to False):
            If True, rewrite the prompt into the native JSON caption before encoding.
        prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
            Sampling temperature for prompt upsampling.
        height (`int`, *optional*):
            Together with width, sets the caption's target aspect ratio.
        width (`int`, *optional*):
            Together with height, sets the caption's target aspect ratio.
        generator (`Generator`, *optional*):
            Reused to make the upsampling reproducible.

    Outputs:
        prompt (`str`):
            The (possibly upsampled) prompt forwarded to the text encoder.
    """

    model_name = "ideogram4"

    def __init__(self, lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO):
        self._lm_head_repo_id = lm_head_repo_id
        # Both are filled in together the first time upsampling is requested, then reused on later calls.
        self._prompt_enhancer = None
        self._caption_logits_processor = None
        super().__init__()

    @property
    def description(self) -> str:
        return (
            "Optional step that rewrites the prompt(s) into Ideogram4's native structured JSON caption "
            "when `prompt_upsampling=True` (the format the model is trained on). On first use it grafts "
            "a hosted LM head onto the text encoder; install `outlines` for schema-constrained captions."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        text_encoder_spec = ComponentSpec("text_encoder", Qwen3VLModel, description="The Qwen3-VL text encoder.")
        tokenizer_spec = ComponentSpec(
            "tokenizer", Qwen2Tokenizer, description="The tokenizer paired with the text encoder."
        )
        return [text_encoder_spec, tokenizer_spec]

    @property
    def inputs(self) -> list[InputParam]:
        upsampling_flag = InputParam(
            name="prompt_upsampling",
            type_hint=bool,
            default=False,
            description="If True, rewrite the prompt into Ideogram4's native JSON caption before encoding.",
        )
        upsampling_temperature = InputParam(
            name="prompt_upsampling_temperature",
            type_hint=float,
            default=PROMPT_UPSAMPLE_TEMPERATURE,
            description="Sampling temperature for prompt upsampling.",
        )
        return [
            InputParam.template("prompt", required=True),
            upsampling_flag,
            upsampling_temperature,
            InputParam.template("height"),
            InputParam.template("width"),
            InputParam.template("generator"),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        prompt_param = OutputParam(
            name="prompt",
            type_hint=list,
            description="The (possibly upsampled) prompt forwarded to the text encoder.",
        )
        return [prompt_param]

    def _upsample(self, components, block_state):
        """Return the captions generated for `block_state.prompt`, grafting the LM head on first use."""
        if self._prompt_enhancer is None:
            enhancer, logits_processor = graft_lm_head(
                components.text_encoder, components.tokenizer, self._lm_head_repo_id
            )
            self._prompt_enhancer = enhancer
            self._caption_logits_processor = logits_processor

        # A falsy height/width falls back to the pipeline defaults.
        target_height = block_state.height or components.default_height
        target_width = block_state.width or components.default_width

        return generate_captions(
            self._prompt_enhancer,
            components.tokenizer,
            self._caption_logits_processor,
            block_state.prompt,
            target_height,
            target_width,
            temperature=block_state.prompt_upsampling_temperature,
            generator=block_state.generator,
            device=components._execution_device,
        )

    @torch.no_grad()
    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
        """Overwrite `prompt` with generated captions when `prompt_upsampling` is truthy; otherwise pass through."""
        block_state = self.get_block_state(state)

        if block_state.prompt_upsampling:
            block_state.prompt = self._upsample(components, block_state)

        self.set_block_state(state, block_state)
        return components, state
150
+
151
+
152
+ # auto_docstring
153
class Ideogram4TextEncoderStep(ModularPipelineBlocks):
    """
    Text encoder step that tokenizes the prompt(s) and runs the Qwen3-VL text encoder, returning the per-token text
    features (concatenated from a fixed set of activation layers). Only the text tokens are encoded; the packed image
    tokens are appended later (the encoder is causal with image after text, so they never affect the text features).

    Components:
        text_encoder (`Qwen3VLModel`): The Qwen3-VL text encoder. tokenizer (`Qwen2Tokenizer`): The tokenizer paired
        with the text encoder.

    Inputs:
        prompt (`str`):
            The prompt or prompts to guide image generation.
        max_sequence_length (`int`, *optional*, defaults to 2048):
            Maximum sequence length for prompt encoding.

    Outputs:
        text_features (`Tensor`):
            Per-prompt text features (B, max_sequence_length, llm_features_dim), padding zeroed.
        text_lengths (`list`):
            Per-prompt real text-token counts, used to lay out the packed sequence.
    """

    model_name = "ideogram4"

    @property
    def description(self) -> str:
        return (
            "Text encoder step that tokenizes the prompt(s) and runs the Qwen3-VL text encoder, returning the "
            "per-token text features (concatenated from a fixed set of activation layers). Only the text tokens are "
            "encoded; the packed image tokens are appended later (the encoder is causal with image after text, so "
            "they never affect the text features)."
        )

    @property
    def expected_components(self) -> list[ComponentSpec]:
        return [
            ComponentSpec("text_encoder", Qwen3VLModel, description="The Qwen3-VL text encoder."),
            ComponentSpec("tokenizer", Qwen2Tokenizer, description="The tokenizer paired with the text encoder."),
        ]

    @property
    def inputs(self) -> list[InputParam]:
        return [
            InputParam.template("prompt", required=True),
            InputParam.template("max_sequence_length", default=2048),
        ]

    @property
    def intermediate_outputs(self) -> list[OutputParam]:
        return [
            OutputParam(
                name="text_features",
                type_hint=torch.Tensor,
                description="Per-prompt text features (B, max_sequence_length, llm_features_dim), padding zeroed.",
            ),
            OutputParam(
                name="text_lengths",
                type_hint=list,
                description="Per-prompt real text-token counts, used to lay out the packed sequence.",
            ),
        ]

    @staticmethod
    # Copied from diffusers.pipelines.ideogram4.pipeline_ideogram4.Ideogram4Pipeline._get_text_encoder_hidden_states
    def _get_text_encoder_hidden_states(
        text_encoder,
        token_ids: torch.Tensor,
        attention_mask: torch.Tensor,
        pos_2d: torch.Tensor,
    ) -> list[torch.Tensor]:
        """Run the text encoder's decoder layers, returning the hidden states tapped at each activation layer."""

        language_model = text_encoder.language_model

        inputs_embeds = language_model.embed_tokens(token_ids)

        position_ids_4d = pos_2d[None, ...].expand(4, pos_2d.shape[0], -1)
        text_position_ids = position_ids_4d[0]
        mrope_position_ids = position_ids_4d[1:]

        causal_mask = create_causal_mask(
            config=language_model.config,
            inputs_embeds=inputs_embeds,
            attention_mask=attention_mask,
            past_key_values=None,
            position_ids=text_position_ids,
        )
        position_embeddings = language_model.rotary_emb(inputs_embeds, mrope_position_ids)

        tap_set = set(QWEN3_VL_ACTIVATION_LAYERS)
        captured: dict[int, torch.Tensor] = {}
        hidden_states = inputs_embeds
        for layer_idx, decoder_layer in enumerate(language_model.layers):
            hidden_states = decoder_layer(
                hidden_states,
                attention_mask=causal_mask,
                position_ids=text_position_ids,
                past_key_values=None,
                position_embeddings=position_embeddings,
            )
            if layer_idx in tap_set:
                captured[layer_idx] = hidden_states

        return [captured[i] for i in QWEN3_VL_ACTIVATION_LAYERS]

    @torch.no_grad()
    def __call__(self, components: Ideogram4ModularPipeline, state: PipelineState) -> PipelineState:
        """Tokenize the prompt(s), run the text encoder and store `text_features` and `text_lengths` in the state.

        Raises:
            ValueError: If a chat-formatted prompt tokenizes to more than `max_sequence_length` tokens.
        """
        block_state = self.get_block_state(state)

        device = components._execution_device
        tokenizer = components.tokenizer
        max_text_tokens = block_state.max_sequence_length

        # Accept either a single prompt string or an iterable of prompts.
        prompts = [block_state.prompt] if isinstance(block_state.prompt, str) else list(block_state.prompt)
        batch_size = len(prompts)

        # Tokenize each chat-formatted prompt and left-pad to `max_sequence_length`.
        # The buffers are created without a device argument and moved to `device` after the loop; padding
        # slots keep token id 0, mask 0 and position 0.
        token_ids = torch.zeros(batch_size, max_text_tokens, dtype=torch.long)
        attention_mask = torch.zeros(batch_size, max_text_tokens, dtype=torch.long)
        text_position_ids = torch.zeros(batch_size, max_text_tokens, dtype=torch.long)
        text_lengths = []
        for b, text_prompt in enumerate(prompts):
            messages = [{"role": "user", "content": [{"type": "text", "text": text_prompt}]}]
            text = tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False)
            # `add_special_tokens=False` is passed because the text has already gone through the chat template.
            toks = tokenizer(text, return_tensors="pt", add_special_tokens=False)["input_ids"][0]
            n = int(toks.shape[0])
            if n > max_text_tokens:
                raise ValueError(f"prompt has {n} tokens, exceeds max_sequence_length={max_text_tokens}")
            text_lengths.append(n)
            # Real tokens occupy the last `n` slots; their positions restart at 0 at the first real token.
            offset = max_text_tokens - n
            token_ids[b, offset:] = toks
            attention_mask[b, offset:] = 1
            text_position_ids[b, offset:] = torch.arange(n)

        token_ids = token_ids.to(device)
        attention_mask = attention_mask.to(device)
        text_position_ids = text_position_ids.to(device)

        # Run the text encoder, tapping the activation-layer hidden states, then concatenate them into per-token
        # text features (padding zeroed).
        selected = self._get_text_encoder_hidden_states(
            components.text_encoder, token_ids, attention_mask, text_position_ids
        )
        # Stacking puts the tapped-layer axis first; the permute moves it last, so the reshape flattens
        # (hidden, layer) with the layer index varying fastest within each token's feature vector.
        text_features = torch.stack(selected, dim=0).permute(1, 2, 3, 0).reshape(batch_size, max_text_tokens, -1)
        # Multiply by the mask to zero the features at padding slots, then store the result in float32.
        text_features = (text_features * attention_mask.to(text_features.dtype).unsqueeze(-1)).to(torch.float32)

        block_state.text_features = text_features
        block_state.text_lengths = text_lengths

        self.set_block_state(state, block_state)
        return components, state
diffusers_src/src/diffusers/modular_pipelines/ideogram4/modular_blocks_ideogram4.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from ...utils import logging
17
+ from ..modular_pipeline import SequentialPipelineBlocks
18
+ from ..modular_pipeline_utils import InsertableDict, OutputParam
19
+ from .before_denoise import (
20
+ Ideogram4PrepareAdditionalInputsStep,
21
+ Ideogram4PrepareLatentsStep,
22
+ Ideogram4SetTimestepsStep,
23
+ Ideogram4TextInputsStep,
24
+ )
25
+ from .decoders import Ideogram4DecodeStep
26
+ from .denoise import Ideogram4AfterDenoiseStep, Ideogram4DenoiseStep
27
+ from .encoders import Ideogram4PromptUpsampleStep, Ideogram4TextEncoderStep
28
+
29
+
30
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
31
+
32
+
33
# Core denoise: consumes the per-prompt text features and produces the unpatchified latents
# (batch/latents/timesteps/ids inputs -> denoising loop -> unpatchify).
# Entries are listed in execution order; each key is the name the step is registered under in
# `Ideogram4CoreDenoiseStep`, which takes its `block_names`/`block_classes` from this mapping.
CORE_DENOISE_BLOCKS = InsertableDict(
    [
        ("input", Ideogram4TextInputsStep()),
        ("prepare_latents", Ideogram4PrepareLatentsStep()),
        ("set_timesteps", Ideogram4SetTimestepsStep()),
        ("prepare_additional_inputs", Ideogram4PrepareAdditionalInputsStep()),
        ("denoise", Ideogram4DenoiseStep()),
        ("after_denoise", Ideogram4AfterDenoiseStep()),
    ]
)
45
+
46
+
47
+ # auto_docstring
48
class Ideogram4CoreDenoiseStep(SequentialPipelineBlocks):
    """
    Core denoising workflow for Ideogram4 text-to-image: prepares the batch/latents/timesteps and the packed denoiser
    inputs, runs the asymmetric-CFG denoising loop over the conditional and unconditional transformers, and
    unpatchifies the result for the decoder.

    Components:
        transformer (`Ideogram4Transformer2DModel`) scheduler (`FlowMatchEulerDiscreteScheduler`)
        unconditional_transformer (`Ideogram4Transformer2DModel`)

    Inputs:
        num_images_per_prompt (`int`, *optional*, defaults to 1):
            The number of images to generate per prompt.
        text_features (`Tensor`):
            Per-prompt text features from the encoder.
        text_lengths (`list`):
            Per-prompt text-token counts from the encoder.
        latents (`Tensor`, *optional*):
            Pre-generated noisy latents for image generation.
        height (`int`):
            The height in pixels of the generated image.
        width (`int`):
            The width in pixels of the generated image.
        generator (`Generator`, *optional*):
            Torch generator for deterministic generation.
        num_inference_steps (`int`, *optional*, defaults to 48):
            The number of denoising steps.
        mu (`float`, *optional*, defaults to 0.0):
            Base mean of the logit-normal schedule.
        std (`float`, *optional*, defaults to 1.5):
            Std of the logit-normal schedule.
        guidance_schedule (`list`, *optional*, defaults to (7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
        7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
        7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 3.0, 3.0, 3.0)):
            Per-step guidance scale schedule (length num_inference_steps).

    Outputs:
        latents (`Tensor`):
            Unpatchified (B, ae_channels, H, W) latents.
    """

    model_name = "ideogram4"
    # Step instances and their names both come from the shared mapping, in its insertion order.
    block_classes = [*CORE_DENOISE_BLOCKS.values()]
    block_names = [*CORE_DENOISE_BLOCKS.keys()]

    @property
    def description(self) -> str:
        return (
            "Core denoising workflow for Ideogram4 text-to-image: prepares the batch/latents/timesteps "
            "and the packed denoiser inputs, runs the asymmetric-CFG denoising loop over the conditional "
            "and unconditional transformers, and unpatchifies the result for the decoder."
        )

    @property
    def outputs(self) -> list[OutputParam]:
        # Only the unpatchified latents are exposed as an output. The batch, timestep and packed-sequence
        # values prepared by the earlier steps are consumed inside the loop and are not updated by it.
        latents_param = OutputParam.template("latents", description="Unpatchified (B, ae_channels, H, W) latents.")
        return [latents_param]
106
+
107
+
108
+ # auto_docstring
109
class Ideogram4AutoBlocks(SequentialPipelineBlocks):
    """
    Auto Modular pipeline for text-to-image generation using Ideogram4: (optional) prompt upsampling -> encode text ->
    core denoise (asymmetric CFG over two transformers) -> decode.

      Supported workflows:
        - `text2image`: requires `prompt`

      Components:
          text_encoder (`Qwen3VLModel`): The Qwen3-VL text encoder. tokenizer (`Qwen2Tokenizer`): The tokenizer paired
          with the text encoder. transformer (`Ideogram4Transformer2DModel`) scheduler
          (`FlowMatchEulerDiscreteScheduler`) unconditional_transformer (`Ideogram4Transformer2DModel`) vae
          (`AutoencoderKLFlux2`) image_processor (`VaeImageProcessor`)

      Inputs:
          prompt (`str`):
              The prompt or prompts to guide image generation.
          prompt_upsampling (`bool`, *optional*, defaults to False):
              Rewrite the prompt into Ideogram4's native structured JSON caption before encoding.
          prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
              Sampling temperature for prompt upsampling.
          max_sequence_length (`int`, *optional*, defaults to 2048):
              Maximum sequence length for prompt encoding.
          num_images_per_prompt (`int`, *optional*, defaults to 1):
              The number of images to generate per prompt.
          latents (`Tensor`, *optional*):
              Pre-generated noisy latents for image generation.
          height (`int`):
              The height in pixels of the generated image.
          width (`int`):
              The width in pixels of the generated image.
          generator (`Generator`, *optional*):
              Torch generator for deterministic generation.
          num_inference_steps (`int`, *optional*, defaults to 48):
              The number of denoising steps.
          mu (`float`, *optional*, defaults to 0.0):
              Base mean of the logit-normal schedule.
          std (`float`, *optional*, defaults to 1.5):
              Std of the logit-normal schedule.
          guidance_schedule (`list`, *optional*, defaults to (7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0,
          7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 3.0, 3.0, 3.0)):
              Per-step guidance scale schedule (length num_inference_steps).
          output_type (`str`, *optional*, defaults to pil):
              Output format: 'pil', 'np', 'pt'.

      Outputs:
          images (`list`):
              Generated images.
    """

    model_name = "ideogram4"
    # Listed in execution order; `block_names` below is index-aligned with this list.
    block_classes = [
        Ideogram4PromptUpsampleStep(),
        Ideogram4TextEncoderStep(),
        Ideogram4CoreDenoiseStep(),
        Ideogram4DecodeStep(),
    ]
    block_names = ["prompt_upsample", "text_encoder", "denoise", "decode"]

    # Workflow map declaring the trigger conditions for each supported workflow.
    # `True` means the workflow triggers when the input is not None.
    _workflow_map = {
        "text2image": {"prompt": True},
    }

    @property
    def description(self) -> str:
        """One-paragraph summary of the pipeline; the class docstring's first paragraph mirrors this text."""
        return (
            "Auto Modular pipeline for text-to-image generation using Ideogram4: (optional) prompt upsampling -> "
            "encode text -> core denoise (asymmetric CFG over two transformers) -> decode."
        )

    @property
    def outputs(self) -> list[OutputParam]:
        """Declare `images` as the single output exposed by the pipeline."""
        return [OutputParam.template("images")]
diffusers_src/src/diffusers/modular_pipelines/ideogram4/modular_pipeline.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Copyright 2026 Ideogram AI and The HuggingFace Team. All rights reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+
16
+ from ..modular_pipeline import ModularPipeline
17
+
18
+
19
class Ideogram4ModularPipeline(ModularPipeline):
    """
    A ModularPipeline for Ideogram4.

    Exposes the fixed patch size and default output resolution as read-only properties, and derives the VAE scale
    factor from the registered `vae` component when one is present.

    > [!WARNING] > This is an experimental feature!
    """

    default_blocks_name = "Ideogram4AutoBlocks"

    @property
    def patch_size(self):
        # Ideogram4 patchifies the VAE output by a factor of 2 before feeding the transformer.
        return 2

    @property
    def default_height(self):
        # Height used when the caller does not supply one.
        return 2048

    @property
    def default_width(self):
        # Width used when the caller does not supply one.
        return 2048

    @property
    def vae_scale_factor(self):
        # Without a registered VAE, fall back to 8.
        vae = getattr(self, "vae", None)
        if vae is None:
            return 8
        # One factor of two per VAE block after the first.
        return 2 ** (len(vae.config.block_out_channels) - 1)
diffusers_src/src/diffusers/modular_pipelines/modular_pipeline.py CHANGED
@@ -126,6 +126,7 @@ MODULAR_PIPELINE_MAPPING = OrderedDict(
126
  ("flux-kontext", _create_default_map_fn("FluxKontextModularPipeline")),
127
  ("flux2", _create_default_map_fn("Flux2ModularPipeline")),
128
  ("flux2-klein", _flux2_klein_map_fn),
 
129
  ("qwenimage", _create_default_map_fn("QwenImageModularPipeline")),
130
  ("qwenimage-edit", _create_default_map_fn("QwenImageEditModularPipeline")),
131
  ("qwenimage-edit-plus", _create_default_map_fn("QwenImageEditPlusModularPipeline")),
 
126
  ("flux-kontext", _create_default_map_fn("FluxKontextModularPipeline")),
127
  ("flux2", _create_default_map_fn("Flux2ModularPipeline")),
128
  ("flux2-klein", _flux2_klein_map_fn),
129
+ ("ideogram4", _create_default_map_fn("Ideogram4ModularPipeline")),
130
  ("qwenimage", _create_default_map_fn("QwenImageModularPipeline")),
131
  ("qwenimage-edit", _create_default_map_fn("QwenImageEditModularPipeline")),
132
  ("qwenimage-edit-plus", _create_default_map_fn("QwenImageEditPlusModularPipeline")),
diffusers_src/src/diffusers/pipelines/auto_pipeline.py CHANGED
@@ -59,6 +59,7 @@ from .flux2 import Flux2KleinPipeline, Flux2Pipeline
59
  from .glm_image import GlmImagePipeline
60
  from .helios import HeliosPipeline, HeliosPyramidPipeline
61
  from .hunyuandit import HunyuanDiTPipeline
 
62
  from .kandinsky import (
63
  KandinskyCombinedPipeline,
64
  KandinskyImg2ImgCombinedPipeline,
@@ -175,6 +176,7 @@ AUTO_TEXT2IMAGE_PIPELINES_MAPPING = OrderedDict(
175
  ("flux-kontext", FluxKontextPipeline),
176
  ("flux2-klein", Flux2KleinPipeline),
177
  ("flux2", Flux2Pipeline),
 
178
  ("lumina", LuminaPipeline),
179
  ("lumina2", Lumina2Pipeline),
180
  ("chroma", ChromaPipeline),
 
59
  from .glm_image import GlmImagePipeline
60
  from .helios import HeliosPipeline, HeliosPyramidPipeline
61
  from .hunyuandit import HunyuanDiTPipeline
62
+ from .ideogram4 import Ideogram4Pipeline
63
  from .kandinsky import (
64
  KandinskyCombinedPipeline,
65
  KandinskyImg2ImgCombinedPipeline,
 
176
  ("flux-kontext", FluxKontextPipeline),
177
  ("flux2-klein", Flux2KleinPipeline),
178
  ("flux2", Flux2Pipeline),
179
+ ("ideogram4", Ideogram4Pipeline),
180
  ("lumina", LuminaPipeline),
181
  ("lumina2", Lumina2Pipeline),
182
  ("chroma", ChromaPipeline),
diffusers_src/src/diffusers/pipelines/ideogram4/pipeline_ideogram4.py CHANGED
@@ -29,11 +29,16 @@ from ...models.transformers.transformer_ideogram4 import (
29
  Ideogram4Transformer2DModel,
30
  )
31
  from ...schedulers import FlowMatchEulerDiscreteScheduler
32
- from ...utils import is_outlines_available, logging, replace_example_docstring
33
  from ...utils.torch_utils import randn_tensor
34
  from ..pipeline_utils import DiffusionPipeline
35
  from .pipeline_output import Ideogram4PipelineOutput
36
- from .prompt_enhancer import CAPTION_SYSTEM_MESSAGE, CAPTION_USER_TEMPLATE, build_caption_logits_processor
 
 
 
 
 
37
 
38
 
39
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
@@ -43,10 +48,6 @@ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
43
  # text conditioning consumed by the Ideogram4 transformer.
44
  QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
45
 
46
- # LM head grafted onto the (head-less) text encoder for optional prompt upsampling.
47
- DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO = "multimodalart/qwen3-vl-8b-instruct-lm-head"
48
- PROMPT_UPSAMPLE_TEMPERATURE = 1.0
49
-
50
 
51
  EXAMPLE_DOC_STRING = """
52
  Examples:
@@ -161,7 +162,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
161
  """
162
 
163
  model_cpu_offload_seq = "text_encoder->transformer->unconditional_transformer->vae"
164
- _optional_components = []
165
  _callback_tensor_inputs = ["latents"]
166
 
167
  def __init__(
@@ -172,6 +173,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
172
  tokenizer: AutoTokenizer,
173
  transformer: Ideogram4Transformer2DModel,
174
  unconditional_transformer: Ideogram4Transformer2DModel,
 
175
  ) -> None:
176
  super().__init__()
177
 
@@ -182,6 +184,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
182
  tokenizer=tokenizer,
183
  transformer=transformer,
184
  unconditional_transformer=unconditional_transformer,
 
185
  )
186
 
187
  self.vae_scale_factor = (
@@ -191,8 +194,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
191
  self.patch_size = 2
192
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * self.patch_size)
193
 
194
- # Lazily built by `load_prompt_enhancer` for optional prompt upsampling.
195
- self._caption_model = None
196
  self._caption_logits_processor = None
197
 
198
  def load_prompt_enhancer(
@@ -207,82 +209,45 @@ class Ideogram4Pipeline(DiffusionPipeline):
207
  Called automatically by `upsample_prompt` on first use. Generation is constrained to the caption JSON
208
  schema when `outlines` is installed; otherwise it falls back to unconstrained decoding with a warning.
209
  """
210
- from accelerate import init_empty_weights
211
- from huggingface_hub import hf_hub_download
212
- from safetensors.torch import load_file
213
- from transformers import Qwen3VLForConditionalGeneration
214
-
215
- dtype = torch_dtype or self.text_encoder.dtype
216
- head_weight = load_file(hf_hub_download(lm_head_repo_id, lm_head_filename))["lm_head.weight"].to(dtype)
217
-
218
- with init_empty_weights():
219
- caption_model = Qwen3VLForConditionalGeneration(self.text_encoder.config)
220
- caption_model.model = self.text_encoder # reuse the loaded encoder body
221
- lm_head = torch.nn.Linear(head_weight.shape[1], head_weight.shape[0], bias=False)
222
- with torch.no_grad():
223
- lm_head.weight.copy_(head_weight)
224
- caption_model.lm_head = lm_head.to(device=self.text_encoder.device, dtype=dtype)
225
- caption_model.eval()
226
-
227
- if is_outlines_available():
228
- logits_processor = build_caption_logits_processor(caption_model, self.tokenizer)
229
- else:
230
- logits_processor = None
231
- logger.warning(
232
- "`outlines` is not installed; prompt upsampling will run unconstrained and may not return "
233
- "schema-valid JSON. Install with `pip install outlines` for structured captions."
234
- )
235
-
236
- self._caption_model = caption_model
237
- self._caption_logits_processor = logits_processor
238
- return caption_model
239
 
240
  def upsample_prompt(
241
  self,
242
  prompt: str | list[str],
243
  height: int = 2048,
244
  width: int = 2048,
 
245
  max_new_tokens: int = 1024,
 
246
  lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
247
  device: torch.device | None = None,
248
  ) -> list[str]:
249
- """Rewrite each prompt into Ideogram4's native structured JSON caption via the grafted text encoder."""
250
- if self._caption_model is None:
 
 
 
251
  self.load_prompt_enhancer(lm_head_repo_id=lm_head_repo_id)
252
 
253
- device = device or self._caption_model.device
254
- prompts = [prompt] if isinstance(prompt, str) else list(prompt)
255
- divisor = math.gcd(width, height) or 1
256
- aspect_ratio = f"{width // divisor}:{height // divisor}"
257
-
258
- captions = []
259
- for text_prompt in prompts:
260
- messages = [
261
- {"role": "system", "content": CAPTION_SYSTEM_MESSAGE},
262
- {
263
- "role": "user",
264
- "content": CAPTION_USER_TEMPLATE.format(aspect_ratio=aspect_ratio, original_prompt=text_prompt),
265
- },
266
- ]
267
- inputs = self.tokenizer.apply_chat_template(
268
- messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
269
- ).to(device)
270
- generate_kwargs = {
271
- "max_new_tokens": max_new_tokens,
272
- "do_sample": True,
273
- "temperature": PROMPT_UPSAMPLE_TEMPERATURE,
274
- "use_cache": True,
275
- }
276
- if self._caption_logits_processor is not None:
277
- self._caption_logits_processor.reset()
278
- generate_kwargs["logits_processor"] = [self._caption_logits_processor]
279
- generated = self._caption_model.generate(**inputs, **generate_kwargs)
280
- new_tokens = generated[:, inputs["input_ids"].shape[1] :]
281
- captions.append(self.tokenizer.decode(new_tokens[0], skip_special_tokens=True).strip())
282
- return captions
283
 
 
284
  def _prepare_ids(
285
- self,
286
  text_lengths: list[int],
287
  grid_h: int,
288
  grid_w: int,
@@ -323,15 +288,16 @@ class Ideogram4Pipeline(DiffusionPipeline):
323
 
324
  return position_ids.to(device), segment_ids.to(device), indicator.to(device)
325
 
 
326
  def _get_text_encoder_hidden_states(
327
- self,
328
  token_ids: torch.Tensor,
329
  attention_mask: torch.Tensor,
330
  pos_2d: torch.Tensor,
331
  ) -> list[torch.Tensor]:
332
  """Run the text encoder's decoder layers, returning the hidden states tapped at each activation layer."""
333
 
334
- language_model = self.text_encoder.language_model
335
 
336
  inputs_embeds = language_model.embed_tokens(token_ids)
337
 
@@ -405,7 +371,9 @@ class Ideogram4Pipeline(DiffusionPipeline):
405
  text_position_ids = text_position_ids.to(device)
406
 
407
  # Concatenate the tapped activation-layer hidden states into per-token text features, zeroing padding.
408
- selected = self._get_text_encoder_hidden_states(token_ids, attention_mask, text_position_ids)
 
 
409
  text_features = torch.stack(selected, dim=0).permute(1, 2, 3, 0).reshape(batch_size, max_sequence_length, -1)
410
  text_features = (text_features * attention_mask.to(text_features.dtype).unsqueeze(-1)).to(torch.float32)
411
 
@@ -509,6 +477,7 @@ class Ideogram4Pipeline(DiffusionPipeline):
509
  mu: float = 0.0,
510
  std: float = 1.5,
511
  prompt_upsampling: bool = False,
 
512
  max_sequence_length: int = 2048,
513
  num_images_per_prompt: int = 1,
514
  generator: torch.Generator | list[torch.Generator] | None = None,
@@ -547,7 +516,10 @@ class Ideogram4Pipeline(DiffusionPipeline):
547
  prompt_upsampling (`bool`, *optional*, defaults to `False`):
548
  If `True`, rewrite `prompt` into Ideogram4's native structured JSON caption via
549
  [`~Ideogram4Pipeline.upsample_prompt`] before encoding. Requires the prompt-enhancer LM head
550
- (downloaded on first use); install `outlines` for schema-constrained captions.
 
 
 
551
  max_sequence_length (`int`, *optional*, defaults to 2048):
552
  Maximum number of text tokens per prompt.
553
  num_images_per_prompt (`int`, *optional*, defaults to 1):
@@ -591,7 +563,14 @@ class Ideogram4Pipeline(DiffusionPipeline):
591
 
592
  # 0. Optionally rewrite the prompt(s) into Ideogram4's native structured JSON caption.
593
  if prompt_upsampling:
594
- prompt = self.upsample_prompt(prompt, height=height, width=width, device=device)
 
 
 
 
 
 
 
595
 
596
  # 1. Image grid (drives both the packed layout and the latent shape).
597
  grid_h, grid_w = (
 
29
  Ideogram4Transformer2DModel,
30
  )
31
  from ...schedulers import FlowMatchEulerDiscreteScheduler
32
+ from ...utils import logging, replace_example_docstring
33
  from ...utils.torch_utils import randn_tensor
34
  from ..pipeline_utils import DiffusionPipeline
35
  from .pipeline_output import Ideogram4PipelineOutput
36
+ from .prompt_enhancer import (
37
+ DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
38
+ PROMPT_UPSAMPLE_TEMPERATURE,
39
+ generate_captions,
40
+ graft_lm_head,
41
+ )
42
 
43
 
44
  logger = logging.get_logger(__name__) # pylint: disable=invalid-name
 
48
  # text conditioning consumed by the Ideogram4 transformer.
49
  QWEN3_VL_ACTIVATION_LAYERS = (0, 3, 6, 9, 12, 15, 18, 21, 24, 27, 30, 33, 35)
50
 
 
 
 
 
51
 
52
  EXAMPLE_DOC_STRING = """
53
  Examples:
 
162
  """
163
 
164
  model_cpu_offload_seq = "text_encoder->transformer->unconditional_transformer->vae"
165
+ _optional_components = ["prompt_enhancer"]
166
  _callback_tensor_inputs = ["latents"]
167
 
168
  def __init__(
 
173
  tokenizer: AutoTokenizer,
174
  transformer: Ideogram4Transformer2DModel,
175
  unconditional_transformer: Ideogram4Transformer2DModel,
176
+ prompt_enhancer: PreTrainedModel | None = None,
177
  ) -> None:
178
  super().__init__()
179
 
 
184
  tokenizer=tokenizer,
185
  transformer=transformer,
186
  unconditional_transformer=unconditional_transformer,
187
+ prompt_enhancer=prompt_enhancer,
188
  )
189
 
190
  self.vae_scale_factor = (
 
194
  self.patch_size = 2
195
  self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * self.patch_size)
196
 
197
+ # Outlines logits processor derived from `prompt_enhancer`; rebuilt by `load_prompt_enhancer`.
 
198
  self._caption_logits_processor = None
199
 
200
  def load_prompt_enhancer(
 
209
  Called automatically by `upsample_prompt` on first use. Generation is constrained to the caption JSON
210
  schema when `outlines` is installed; otherwise it falls back to unconstrained decoding with a warning.
211
  """
212
+ prompt_enhancer, self._caption_logits_processor = graft_lm_head(
213
+ self.text_encoder, self.tokenizer, lm_head_repo_id, lm_head_filename, torch_dtype
214
+ )
215
+ self.register_modules(prompt_enhancer=prompt_enhancer)
216
+ return prompt_enhancer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
217
 
218
  def upsample_prompt(
219
  self,
220
  prompt: str | list[str],
221
  height: int = 2048,
222
  width: int = 2048,
223
+ temperature: float = PROMPT_UPSAMPLE_TEMPERATURE,
224
  max_new_tokens: int = 1024,
225
+ generator: torch.Generator | list[torch.Generator] | None = None,
226
  lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
227
  device: torch.device | None = None,
228
  ) -> list[str]:
229
+ """Rewrite each prompt into Ideogram4's native structured JSON caption via the grafted text encoder.
230
+
231
+ Pass `generator` (the same one accepted by `__call__`) to make sampling reproducible.
232
+ """
233
+ if self.prompt_enhancer is None:
234
  self.load_prompt_enhancer(lm_head_repo_id=lm_head_repo_id)
235
 
236
+ return generate_captions(
237
+ self.prompt_enhancer,
238
+ self.tokenizer,
239
+ self._caption_logits_processor,
240
+ prompt,
241
+ height,
242
+ width,
243
+ temperature=temperature,
244
+ max_new_tokens=max_new_tokens,
245
+ generator=generator,
246
+ device=device,
247
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
 
249
+ @staticmethod
250
  def _prepare_ids(
 
251
  text_lengths: list[int],
252
  grid_h: int,
253
  grid_w: int,
 
288
 
289
  return position_ids.to(device), segment_ids.to(device), indicator.to(device)
290
 
291
+ @staticmethod
292
  def _get_text_encoder_hidden_states(
293
+ text_encoder,
294
  token_ids: torch.Tensor,
295
  attention_mask: torch.Tensor,
296
  pos_2d: torch.Tensor,
297
  ) -> list[torch.Tensor]:
298
  """Run the text encoder's decoder layers, returning the hidden states tapped at each activation layer."""
299
 
300
+ language_model = text_encoder.language_model
301
 
302
  inputs_embeds = language_model.embed_tokens(token_ids)
303
 
 
371
  text_position_ids = text_position_ids.to(device)
372
 
373
  # Concatenate the tapped activation-layer hidden states into per-token text features, zeroing padding.
374
+ selected = self._get_text_encoder_hidden_states(
375
+ self.text_encoder, token_ids, attention_mask, text_position_ids
376
+ )
377
  text_features = torch.stack(selected, dim=0).permute(1, 2, 3, 0).reshape(batch_size, max_sequence_length, -1)
378
  text_features = (text_features * attention_mask.to(text_features.dtype).unsqueeze(-1)).to(torch.float32)
379
 
 
477
  mu: float = 0.0,
478
  std: float = 1.5,
479
  prompt_upsampling: bool = False,
480
+ prompt_upsampling_temperature: float = PROMPT_UPSAMPLE_TEMPERATURE,
481
  max_sequence_length: int = 2048,
482
  num_images_per_prompt: int = 1,
483
  generator: torch.Generator | list[torch.Generator] | None = None,
 
516
  prompt_upsampling (`bool`, *optional*, defaults to `False`):
517
  If `True`, rewrite `prompt` into Ideogram4's native structured JSON caption via
518
  [`~Ideogram4Pipeline.upsample_prompt`] before encoding. Requires the prompt-enhancer LM head
519
+ (downloaded on first use); install `outlines` for schema-constrained captions. `generator` is
520
+ reused to make the upsampling reproducible.
521
+ prompt_upsampling_temperature (`float`, *optional*, defaults to 1.0):
522
+ Sampling temperature for prompt upsampling when `prompt_upsampling=True`.
523
  max_sequence_length (`int`, *optional*, defaults to 2048):
524
  Maximum number of text tokens per prompt.
525
  num_images_per_prompt (`int`, *optional*, defaults to 1):
 
563
 
564
  # 0. Optionally rewrite the prompt(s) into Ideogram4's native structured JSON caption.
565
  if prompt_upsampling:
566
+ prompt = self.upsample_prompt(
567
+ prompt,
568
+ height=height,
569
+ width=width,
570
+ temperature=prompt_upsampling_temperature,
571
+ generator=generator,
572
+ device=device,
573
+ )
574
 
575
  # 1. Image grid (drives both the packed layout and the latent shape).
576
  grid_h, grid_w = (
diffusers_src/src/diffusers/pipelines/ideogram4/prompt_enhancer.py CHANGED
@@ -20,8 +20,24 @@ Qwen3-VL text encoder grafted with a generative head (see `Ideogram4Pipeline.loa
20
 
21
  This mirrors the role of Flux2's `system_messages.py`, but the target is a constrained JSON object instead of
22
  free text, so `outlines` (an optional dependency) is used to guarantee a schema-valid result when available.
 
 
23
  """
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  # System message that instructs the encoder to emit Ideogram4's native single-line JSON caption.
26
  CAPTION_SYSTEM_MESSAGE = """You convert a short user idea into a structured JSON caption for an image renderer. Output ONE minified single-line JSON object and NOTHING else (no markdown, no commentary).
27
 
@@ -107,3 +123,102 @@ def build_caption_logits_processor(model, tokenizer):
107
 
108
  outlines_model = outlines.from_transformers(model, tokenizer)
109
  return outlines.Generator(outlines_model, Caption).logits_processor
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
 
21
  This mirrors the role of Flux2's `system_messages.py`, but the target is a constrained JSON object instead of
22
  free text, so `outlines` (an optional dependency) is used to guarantee a schema-valid result when available.
23
+
24
+ The graft/generate helpers here are shared by `Ideogram4Pipeline` and the modular `Ideogram4PromptUpsampleStep`.
25
  """
26
 
27
+ import math
28
+
29
+ import torch
30
+
31
+ from ...utils import is_outlines_available, logging
32
+
33
+
34
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
35
+
36
+ # Qwen3-VL LM head grafted onto the (head-less) text encoder for prompt upsampling.
37
+ DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO = "diffusers/qwen3-vl-8b-instruct-lm-head"
38
+ PROMPT_UPSAMPLE_TEMPERATURE = 1.0
39
+
40
+
41
  # System message that instructs the encoder to emit Ideogram4's native single-line JSON caption.
42
  CAPTION_SYSTEM_MESSAGE = """You convert a short user idea into a structured JSON caption for an image renderer. Output ONE minified single-line JSON object and NOTHING else (no markdown, no commentary).
43
 
 
123
 
124
  outlines_model = outlines.from_transformers(model, tokenizer)
125
  return outlines.Generator(outlines_model, Caption).logits_processor
126
+
127
+
128
def graft_lm_head(
    text_encoder,
    tokenizer,
    lm_head_repo_id: str = DEFAULT_PROMPT_ENHANCER_LM_HEAD_REPO,
    lm_head_filename: str = "lm_head.safetensors",
    torch_dtype: torch.dtype | None = None,
):
    """Graft a hosted LM head onto the (head-less) Qwen3-VL `text_encoder` to make it generative.

    Args:
        text_encoder: Loaded encoder body; it is reused (not copied) as the `model` of the returned wrapper.
        tokenizer: Tokenizer handed to the `outlines` logits-processor builder.
        lm_head_repo_id: Hub repository that hosts the LM head weights.
        lm_head_filename: Safetensors file inside the repository; it must contain an `lm_head.weight` tensor.
        torch_dtype: Dtype for the head; falls back to `text_encoder.dtype` when not given.

    Returns:
        `(prompt_enhancer, logits_processor)`. The logits processor constrains generation to the caption JSON
        schema when `outlines` is installed; otherwise it is `None` and generation runs unconstrained (a warning
        is logged).
    """
    from accelerate import init_empty_weights
    from huggingface_hub import hf_hub_download
    from safetensors.torch import load_file
    from transformers import Qwen3VLForConditionalGeneration

    dtype = torch_dtype or text_encoder.dtype
    head_path = hf_hub_download(lm_head_repo_id, lm_head_filename)
    head_weight = load_file(head_path)["lm_head.weight"].to(device=text_encoder.device, dtype=dtype)
    out_features, in_features = head_weight.shape

    # Only the wrapper skeleton is built on the meta device; its real weights come from `text_encoder`.
    with init_empty_weights():
        prompt_enhancer = Qwen3VLForConditionalGeneration(text_encoder.config)
        prompt_enhancer.model = text_encoder  # reuse the loaded encoder body

    # Create the projection on the meta device and attach the loaded tensor directly. This skips allocating and
    # randomly initialising a full-size float32 matrix that would be overwritten immediately.
    lm_head = torch.nn.Linear(in_features, out_features, bias=False, device="meta")
    lm_head.weight = torch.nn.Parameter(head_weight)
    prompt_enhancer.lm_head = lm_head
    prompt_enhancer.eval()

    if is_outlines_available():
        logits_processor = build_caption_logits_processor(prompt_enhancer, tokenizer)
    else:
        logits_processor = None
        logger.warning(
            "`outlines` is not installed; prompt upsampling will run unconstrained and may not return "
            "schema-valid JSON. Install with `pip install outlines` for structured captions."
        )
    return prompt_enhancer, logits_processor
167
+
168
+
169
def generate_captions(
    prompt_enhancer,
    tokenizer,
    logits_processor,
    prompt: str | list[str],
    height: int,
    width: int,
    temperature: float = PROMPT_UPSAMPLE_TEMPERATURE,
    max_new_tokens: int = 1024,
    generator: torch.Generator | list[torch.Generator] | None = None,
    device: torch.device | None = None,
) -> list[str]:
    """Rewrite each prompt into the native structured JSON caption with the grafted `prompt_enhancer`.

    Args:
        prompt_enhancer: Generative model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer used for the chat template and for decoding the generated tokens.
        logits_processor: Optional schema-constraining processor; it is reset before every prompt.
        prompt: One prompt or a list of prompts; each is captioned independently.
        height: Target image height; only used to derive the aspect ratio given to the model.
        width: Target image width; only used to derive the aspect ratio given to the model.
        temperature: Sampling temperature. Values `<= 0` switch to greedy decoding.
        max_new_tokens: Generation budget per caption.
        generator: Optional generator (or list; the first entry is used) that makes sampling reproducible. One
            random draw is taken from it to derive a seed. The seed is applied inside a forked RNG scope so the
            global RNG state is restored afterwards.
        device: Device for the tokenized inputs; defaults to `prompt_enhancer.device`. Strings are accepted.

    Returns:
        One stripped caption string per input prompt, in input order.
    """
    if device is None:
        device = prompt_enhancer.device
    elif not isinstance(device, torch.device):
        # Normalise "cuda"/"cuda:0"-style values so the CUDA check below sees them. Otherwise the CUDA RNG would
        # not be forked and `torch.manual_seed` would overwrite the caller's CUDA RNG state.
        device = torch.device(device)

    prompts = [prompt] if isinstance(prompt, str) else list(prompt)
    divisor = math.gcd(width, height) or 1
    aspect_ratio = f"{width // divisor}:{height // divisor}"

    if isinstance(generator, (list, tuple)):
        # An empty list means "no generator" rather than an IndexError.
        generator = generator[0] if generator else None
    sampling_seed = None
    if generator is not None:
        sampling_seed = int(
            torch.randint(0, 2**63 - 1, (1,), generator=generator, device=generator.device).item()
        )
    fork_devices = [device] if getattr(device, "type", None) == "cuda" else []

    do_sample = temperature > 0
    base_generate_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "use_cache": True,
    }
    if do_sample:
        # Temperature only applies when sampling; sending it with greedy decoding is flagged as an invalid
        # generation setting by transformers.
        base_generate_kwargs["temperature"] = temperature

    captions = []
    for i, text_prompt in enumerate(prompts):
        messages = [
            {"role": "system", "content": CAPTION_SYSTEM_MESSAGE},
            {
                "role": "user",
                "content": CAPTION_USER_TEMPLATE.format(aspect_ratio=aspect_ratio, original_prompt=text_prompt),
            },
        ]
        inputs = tokenizer.apply_chat_template(
            messages, add_generation_prompt=True, tokenize=True, return_tensors="pt", return_dict=True
        ).to(device)

        generate_kwargs = dict(base_generate_kwargs)
        if logits_processor is not None:
            logits_processor.reset()
            generate_kwargs["logits_processor"] = [logits_processor]

        with torch.random.fork_rng(devices=fork_devices, enabled=sampling_seed is not None):
            if sampling_seed is not None:
                # Offset by the prompt index so each prompt gets its own reproducible stream.
                torch.manual_seed(sampling_seed + i)
            generated = prompt_enhancer.generate(**inputs, **generate_kwargs)

        # Drop the echoed prompt tokens and keep only the newly generated continuation.
        new_tokens = generated[:, inputs["input_ids"].shape[1] :]
        captions.append(tokenizer.decode(new_tokens[0], skip_special_tokens=True).strip())
    return captions
diffusers_src/src/diffusers/utils/dummy_torch_and_transformers_objects.py CHANGED
@@ -332,6 +332,36 @@ class HunyuanVideo15ModularPipeline(metaclass=DummyObject):
332
  requires_backends(cls, ["torch", "transformers"])
333
 
334
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
335
  class LTXAutoBlocks(metaclass=DummyObject):
336
  _backends = ["torch", "transformers"]
337
 
 
332
  requires_backends(cls, ["torch", "transformers"])
333
 
334
 
335
class Ideogram4AutoBlocks(metaclass=DummyObject):
    # Placeholder class: construction and both classmethods only call `requires_backends`
    # with the torch + transformers backends.
    # NOTE(review): presumably exported when those backends are missing, and this module looks
    # generated — confirm before editing by hand.
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])
348
+
349
+
350
class Ideogram4ModularPipeline(metaclass=DummyObject):
    # Placeholder class: construction and both classmethods only call `requires_backends`
    # with the torch + transformers backends.
    # NOTE(review): presumably exported when those backends are missing, and this module looks
    # generated — confirm before editing by hand.
    _backends = ["torch", "transformers"]

    def __init__(self, *args, **kwargs):
        requires_backends(self, ["torch", "transformers"])

    @classmethod
    def from_config(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])

    @classmethod
    def from_pretrained(cls, *args, **kwargs):
        requires_backends(cls, ["torch", "transformers"])
363
+
364
+
365
  class LTXAutoBlocks(metaclass=DummyObject):
366
  _backends = ["torch", "transformers"]
367