Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

ideogram4 / diffusers_src /src /diffusers /modular_pipelines /helios /denoise.py

multimodalart HF Staff

Embed diffusers PR source; install locally

b8c861f verified 9 days ago

raw

history blame

43.5 kB

	# Copyright 2025 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	import inspect
	import math

	import torch
	import torch.nn.functional as F
	from tqdm.auto import tqdm

	from ...configuration_utils import FrozenDict
	from ...guiders import ClassifierFreeGuidance, ClassifierFreeZeroStarGuidance
	from ...models import HeliosTransformer3DModel
	from ...schedulers import HeliosScheduler
	from ...utils import logging
	from ...utils.torch_utils import randn_tensor
	from ..modular_pipeline import (
	BlockState,
	LoopSequentialPipelineBlocks,
	ModularPipelineBlocks,
	PipelineState,
	)
	from ..modular_pipeline_utils import ComponentSpec, InputParam, OutputParam
	from .before_denoise import calculate_shift
	from .modular_pipeline import HeliosModularPipeline


	# Module-level logger, obtained from the package's logging helper and keyed by this module's name.
	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	def sample_block_noise(
	    batch_size,
	    channel,
	    num_frames,
	    height,
	    width,
	    gamma,
	    patch_size=(1, 2, 2),
	    device=None,
	    generator=None,
	):
	    """Generate spatially-correlated block noise for pyramid upsampling correction.

	    Every `ph x pw` spatial patch is drawn from a multivariate normal whose covariance is
	    `(1 + gamma) * I - gamma * ones`, so values inside one patch are correlated with each other while separate
	    patches are sampled independently.

	    Args:
	        batch_size: Size of the batch dimension of the returned tensor.
	        channel: Size of the channel dimension of the returned tensor.
	        num_frames: Size of the frame dimension of the returned tensor.
	        height: Output height; must be a multiple of `patch_size[1]`.
	        width: Output width; must be a multiple of `patch_size[2]`.
	        gamma: Correlation strength used to build the per-patch covariance matrix.
	        patch_size: `(t, h, w)` patch size; only the two spatial entries are used.
	        device: Device the covariance is built on and the returned noise is moved to.
	        generator: A `torch.Generator`, or a list of generators of which only the first is used.

	    Returns:
	        A tensor of shape `(batch_size, channel, num_frames, height, width)`.

	    Raises:
	        ValueError: If `height` or `width` is not divisible by the corresponding spatial patch size.
	    """
	    _, ph, pw = patch_size

	    # Fail early with an explicit message: a non-divisible size would otherwise only surface as an
	    # opaque shape error in the final reshape below.
	    if height % ph != 0 or width % pw != 0:
	        raise ValueError(
	            f"`height` ({height}) and `width` ({width}) must be divisible by the spatial patch size "
	            f"({ph}, {pw}) to sample block noise."
	        )

	    # NOTE: A generator must be provided to ensure correct and reproducible results.
	    # Creating a default generator here is a fallback only — without a fixed seed,
	    # the output will be non-deterministic and may produce incorrect results in CP context.
	    if generator is None:
	        generator = torch.Generator(device=device)
	    elif isinstance(generator, list):
	        generator = generator[0]

	    block_size = ph * pw

	    cov = (
	        torch.eye(block_size, device=device) * (1 + gamma) - torch.ones(block_size, block_size, device=device) * gamma
	    )
	    # Small diagonal jitter before factorizing.
	    cov += torch.eye(block_size, device=device) * 1e-8
	    cov = cov.float()  # Upcast to fp32 for numerical stability — cholesky is unreliable in fp16/bf16.

	    chol_factor = torch.linalg.cholesky(cov)
	    grid_h, grid_w = height // ph, width // pw
	    block_number = batch_size * channel * num_frames * grid_h * grid_w

	    # Sample on the generator's own device, then move to the requested device.
	    z = torch.randn(block_number, block_size, device=generator.device, generator=generator).to(device)
	    noise = z @ chol_factor.T

	    # One row per patch -> (B, C, F, grid_h, grid_w, ph, pw) -> interleave patch rows/cols back into an image.
	    noise = noise.view(batch_size, channel, num_frames, grid_h, grid_w, ph, pw)
	    noise = noise.permute(0, 1, 2, 3, 5, 4, 6).reshape(batch_size, channel, num_frames, height, width)
	    return noise


	# ========================================
	# Chunk Loop Leaf Blocks
	# ========================================


	class HeliosChunkHistorySliceStep(ModularPipelineBlocks):
	    """Per-chunk T2V step that carves the history buffer into long, mid and short windows.

	    With `keep_first_frame` enabled, the short window is prefixed by a single-frame tensor: zeros on the first
	    chunk (k == 0) when no `image_latents` exist yet, otherwise `image_latents` (either supplied, or stored on the
	    block state by HeliosChunkUpdateStep after the first chunk).
	    """

	    model_name = "helios"

	    @property
	    def description(self) -> str:
	        return (
	            "T2V history slice: splits history into long/mid/short. At k==0 with no image_latents, "
	            "creates a zero prefix; otherwise uses image_latents as prefix for short history."
	        )

	    @property
	    def inputs(self) -> list[InputParam]:
	        keep_first_frame = InputParam(
	            "keep_first_frame",
	            default=True,
	            type_hint=bool,
	            description="Whether to keep the first frame as a prefix in history.",
	        )
	        history_sizes = InputParam(
	            "history_sizes",
	            required=True,
	            type_hint=list,
	            description="Sizes of long/mid/short history buffers for temporal context.",
	        )
	        history_latents = InputParam(
	            "history_latents",
	            required=True,
	            type_hint=torch.Tensor,
	            description="Accumulated history latents from previous chunks.",
	        )
	        latent_shape = InputParam("latent_shape", required=True, type_hint=tuple)
	        return [keep_first_frame, history_sizes, history_latents, latent_shape]

	    @property
	    def intermediate_outputs(self) -> list[OutputParam]:
	        return []

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        use_prefix = block_state.keep_first_frame
	        window_sizes = block_state.history_sizes
	        first_frame_latents = block_state.image_latents
	        target_device = components._execution_device

	        batch, channels, _, lat_h, lat_w = block_state.latent_shape

	        # Both modes split the same trailing window of the history along the frame axis (dim 2).
	        recent_history = block_state.history_latents[:, :, -sum(window_sizes) :]
	        history_long, history_mid, history_tail = recent_history.split(window_sizes, dim=2)

	        if use_prefix:
	            if first_frame_latents is None and k == 0:
	                # Nothing to anchor on yet: use a single all-zero frame.
	                prefix = torch.zeros(
	                    (batch, channels, 1, lat_h, lat_w),
	                    device=target_device,
	                    dtype=torch.float32,
	                )
	            else:
	                prefix = first_frame_latents
	            history_short = torch.cat([prefix, history_tail], dim=2)
	        else:
	            history_short = history_tail

	        block_state.latents_history_short = history_short
	        block_state.latents_history_mid = history_mid
	        block_state.latents_history_long = history_long

	        return components, block_state


	class HeliosI2VChunkHistorySliceStep(ModularPipelineBlocks):
	    """Per-chunk I2V step that carves the history buffer into long, mid and short windows.

	    With `keep_first_frame` enabled, `image_latents` is always prepended to the short window (the history is
	    assumed to have been pre-seeded with fake_image_latents).
	    """

	    model_name = "helios"

	    @property
	    def description(self) -> str:
	        return (
	            "I2V history slice: splits pre-seeded history into long/mid/short, "
	            "always using image_latents as prefix for short history."
	        )

	    @property
	    def inputs(self) -> list[InputParam]:
	        keep_first_frame = InputParam(
	            "keep_first_frame",
	            default=True,
	            type_hint=bool,
	            description="Whether to keep the first frame as a prefix in history.",
	        )
	        history_sizes = InputParam(
	            "history_sizes",
	            required=True,
	            type_hint=list,
	            description="Sizes of long/mid/short history buffers for temporal context.",
	        )
	        history_latents = InputParam(
	            "history_latents",
	            required=True,
	            type_hint=torch.Tensor,
	            description="Accumulated history latents from previous chunks.",
	        )
	        image_latents = InputParam(
	            "image_latents",
	            required=True,
	            type_hint=torch.Tensor,
	            description="First-frame latents used as prefix for short history.",
	        )
	        return [keep_first_frame, history_sizes, history_latents, image_latents]

	    @property
	    def intermediate_outputs(self) -> list[OutputParam]:
	        return []

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        use_prefix = block_state.keep_first_frame
	        window_sizes = block_state.history_sizes
	        first_frame_latents = block_state.image_latents

	        # The same trailing window is split along the frame axis (dim 2) in both modes.
	        recent_history = block_state.history_latents[:, :, -sum(window_sizes) :]
	        history_long, history_mid, history_tail = recent_history.split(window_sizes, dim=2)

	        if use_prefix:
	            history_short = torch.cat([first_frame_latents, history_tail], dim=2)
	        else:
	            history_short = history_tail

	        block_state.latents_history_short = history_short
	        block_state.latents_history_mid = history_mid
	        block_state.latents_history_long = history_long

	        return components, block_state


	class HeliosChunkNoiseGenStep(ModularPipelineBlocks):
	    """Draws the initial noise latents for one chunk via `randn_tensor`."""

	    model_name = "helios"

	    @property
	    def description(self) -> str:
	        return "Generates random noise latents at full resolution for a single chunk."

	    @property
	    def inputs(self) -> list[InputParam]:
	        latent_shape = InputParam("latent_shape", required=True, type_hint=tuple)
	        generator = InputParam.template("generator")
	        return [latent_shape, generator]

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        target_device = components._execution_device
	        # Noise is sampled in float32 with the shape stored in `latent_shape`.
	        chunk_noise = randn_tensor(
	            block_state.latent_shape,
	            generator=block_state.generator,
	            device=target_device,
	            dtype=torch.float32,
	        )
	        block_state.latents = chunk_noise
	        return components, block_state


	class HeliosPyramidChunkNoiseGenStep(ModularPipelineBlocks):
	    """Draws full-resolution noise for a chunk and shrinks it to the smallest pyramid level."""

	    model_name = "helios-pyramid"

	    @property
	    def description(self) -> str:
	        return (
	            "Generates random noise at full resolution, then downsamples to the smallest "
	            "pyramid level via bilinear interpolation."
	        )

	    @property
	    def inputs(self) -> list[InputParam]:
	        latent_shape = InputParam("latent_shape", required=True, type_hint=tuple)
	        stage_steps = InputParam(
	            "pyramid_num_inference_steps_list",
	            default=[10, 10, 10],
	            type_hint=list,
	            description="Number of denoising steps per pyramid stage.",
	        )
	        generator = InputParam.template("generator")
	        return [latent_shape, stage_steps, generator]

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        target_device = components._execution_device
	        batch, channels, frames, height, width = block_state.latent_shape

	        noise = randn_tensor(
	            block_state.latent_shape,
	            generator=block_state.generator,
	            device=target_device,
	            dtype=torch.float32,
	        )

	        # Fold the frame axis into the batch axis so the 2D interpolation treats each frame separately.
	        per_frame = noise.permute(0, 2, 1, 3, 4).reshape(batch * frames, channels, height, width)

	        # One halving per pyramid stage after the first.
	        num_halvings = len(block_state.pyramid_num_inference_steps_list) - 1
	        for _ in range(num_halvings):
	            height, width = height // 2, width // 2
	            # NOTE(review): the factor of 2 presumably offsets the variance lost to bilinear averaging — confirm.
	            per_frame = F.interpolate(per_frame, size=(height, width), mode="bilinear") * 2

	        # Restore the (batch, channels, frames, h, w) layout.
	        unfolded = per_frame.reshape(batch, frames, channels, height, width)
	        block_state.latents = unfolded.permute(0, 2, 1, 3, 4)

	        return components, block_state


	class HeliosChunkSchedulerResetStep(ModularPipelineBlocks):
	    """Re-initializes the scheduler's timesteps before a chunk is denoised."""

	    model_name = "helios"

	    @property
	    def description(self) -> str:
	        return "Resets the scheduler with the correct timesteps and shift parameter (mu) for this chunk."

	    @property
	    def expected_components(self) -> list[ComponentSpec]:
	        scheduler_spec = ComponentSpec("scheduler", HeliosScheduler)
	        return [scheduler_spec]

	    @property
	    def inputs(self) -> list[InputParam]:
	        mu = InputParam("mu", required=True, type_hint=float)
	        sigmas = InputParam.template("sigmas", required=True)
	        num_inference_steps = InputParam.template("num_inference_steps")
	        return [mu, sigmas, num_inference_steps]

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        target_device = components._execution_device
	        scheduler = components.scheduler

	        scheduler.set_timesteps(
	            block_state.num_inference_steps,
	            device=target_device,
	            sigmas=block_state.sigmas,
	            mu=block_state.mu,
	        )
	        # Expose the freshly computed schedule to the blocks that follow.
	        block_state.timesteps = scheduler.timesteps

	        return components, block_state


	# ========================================
	# Inner Denoising Blocks
	# ========================================


	class HeliosChunkDenoiseInner(ModularPipelineBlocks):
	    """Runs the per-timestep denoising loop for one chunk, delegating guidance to the guider component."""

	    model_name = "helios"

	    @property
	    def description(self) -> str:
	        return (
	            "Inner denoising loop that iterates over timesteps for a single chunk. "
	            "Uses the guider to manage conditional/unconditional forward passes with cache_context, "
	            "applies guidance, and runs scheduler step."
	        )

	    @property
	    def expected_components(self) -> list[ComponentSpec]:
	        guider_spec = ComponentSpec(
	            "guider",
	            ClassifierFreeGuidance,
	            config=FrozenDict({"guidance_scale": 5.0}),
	            default_creation_method="from_config",
	        )
	        return [
	            ComponentSpec("transformer", HeliosTransformer3DModel),
	            ComponentSpec("scheduler", HeliosScheduler),
	            guider_spec,
	        ]

	    @property
	    def inputs(self) -> list[InputParam]:
	        return [
	            InputParam.template("latents"),
	            InputParam.template("timesteps"),
	            InputParam("prompt_embeds", type_hint=torch.Tensor),
	            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
	            InputParam.template("denoiser_input_fields"),
	            InputParam.template("num_inference_steps"),
	            InputParam.template("attention_kwargs"),
	            InputParam.template("generator"),
	        ]

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        latents = block_state.latents
	        timesteps = block_state.timesteps
	        num_inference_steps = block_state.num_inference_steps

	        transformer = components.transformer
	        model_dtype = transformer.dtype
	        scheduler = components.scheduler
	        total_timesteps = len(timesteps)
	        num_warmup_steps = total_timesteps - num_inference_steps * scheduler.order

	        # The text embedding is the only input that differs between the conditional and unconditional pass.
	        guider_inputs = {
	            "encoder_hidden_states": (block_state.prompt_embeds, block_state.negative_prompt_embeds),
	        }

	        # Forward every denoiser field the transformer accepts, except those the guider already manages.
	        accepted_args = set(inspect.signature(transformer.forward).parameters.keys())
	        shared_kwargs = {
	            field_name: field_value
	            for field_name, field_value in block_state.denoiser_input_fields.items()
	            if field_name in accepted_args and field_name not in guider_inputs
	        }

	        # History windows are cast to the transformer dtype; attention kwargs are passed through untouched.
	        shared_kwargs.update(
	            latents_history_short=block_state.latents_history_short.to(model_dtype),
	            latents_history_mid=block_state.latents_history_mid.to(model_dtype),
	            latents_history_long=block_state.latents_history_long.to(model_dtype),
	            attention_kwargs=block_state.attention_kwargs,
	        )

	        with tqdm(total=num_inference_steps) as progress_bar:
	            for step_idx, t in enumerate(timesteps):
	                guider = components.guider
	                batched_timestep = t.expand(latents.shape[0]).to(torch.int64)
	                model_input = latents.to(model_dtype)

	                guider.set_state(step=step_idx, num_inference_steps=num_inference_steps, timestep=t)
	                guider_state = guider.prepare_inputs(guider_inputs)

	                # One transformer forward pass per batch the guider prepared.
	                for guider_batch in guider_state:
	                    guider.prepare_models(transformer)
	                    cond_kwargs = {name: getattr(guider_batch, name) for name in guider_inputs}

	                    context_name = getattr(guider_batch, guider._identifier_key)
	                    with transformer.cache_context(context_name):
	                        guider_batch.noise_pred = transformer(
	                            hidden_states=model_input,
	                            timestep=batched_timestep,
	                            return_dict=False,
	                            **cond_kwargs,
	                            **shared_kwargs,
	                        )[0]
	                    guider.cleanup_models(transformer)

	                # Combine the per-batch predictions into a single guided prediction.
	                noise_pred = guider(guider_state)[0]

	                latents = scheduler.step(
	                    noise_pred,
	                    t,
	                    latents,
	                    generator=block_state.generator,
	                    return_dict=False,
	                )[0]

	                steps_done = step_idx + 1
	                past_warmup = steps_done > num_warmup_steps and steps_done % scheduler.order == 0
	                if steps_done == total_timesteps or past_warmup:
	                    progress_bar.update()

	        block_state.latents = latents
	        return components, block_state


	class HeliosPyramidChunkDenoiseInner(ModularPipelineBlocks):
	    """Nested pyramid stage loop with inner timestep denoising.

	    For each pyramid stage (small -> full resolution):
	    1. Compute mu from the current resolution and set the scheduler timesteps for the stage
	    2. Upsample latents + block noise correction (stages > 0)
	    3. Run timestep denoising loop (same logic as HeliosChunkDenoiseInner)

	    If the guider exposes `zero_init_steps`, it is forced to 0 for stages > 0 and restored to its original value
	    when this block finishes, including when an exception aborts the loop.
	    """

	    model_name = "helios-pyramid"

	    @property
	    def description(self) -> str:
	        return (
	            "Pyramid denoising inner block: loops over pyramid stages from smallest to full resolution. "
	            "Each stage upsamples latents (with block noise correction), recomputes scheduler parameters, "
	            "and runs the timestep denoising loop."
	        )

	    @property
	    def expected_components(self) -> list[ComponentSpec]:
	        return [
	            ComponentSpec("transformer", HeliosTransformer3DModel),
	            ComponentSpec("scheduler", HeliosScheduler),
	            ComponentSpec(
	                "guider",
	                ClassifierFreeZeroStarGuidance,
	                config=FrozenDict({"guidance_scale": 5.0, "zero_init_steps": 2}),
	                default_creation_method="from_config",
	            ),
	        ]

	    @property
	    def inputs(self) -> list[InputParam]:
	        return [
	            InputParam.template("latents"),
	            InputParam("prompt_embeds", type_hint=torch.Tensor),
	            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
	            InputParam.template("denoiser_input_fields"),
	            InputParam(
	                "pyramid_num_inference_steps_list",
	                default=[10, 10, 10],
	                type_hint=list,
	                description="Number of denoising steps per pyramid stage.",
	            ),
	            InputParam.template("attention_kwargs"),
	            InputParam.template("generator"),
	        ]

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        """Denoise one chunk across all pyramid stages and store the result in `block_state.latents`.

	        Args:
	            components: Pipeline components; the transformer, scheduler and guider are used.
	            block_state: Per-chunk state holding the latents, prompt embeddings, history windows and options.
	            k: Index of the current chunk (not used by this block).

	        Returns:
	            The `(components, block_state)` pair, with `block_state.latents` replaced by the denoised latents.
	        """
	        device = components._execution_device
	        transformer_dtype = components.transformer.dtype
	        latents = block_state.latents
	        pyramid_num_stages = len(block_state.pyramid_num_inference_steps_list)

	        # Guider inputs: only encoder_hidden_states differs between cond/uncond
	        guider_inputs = {
	            "encoder_hidden_states": (block_state.prompt_embeds, block_state.negative_prompt_embeds),
	        }

	        # Build shared kwargs from denoiser_input_fields (excludes guider-managed ones)
	        transformer_args = set(inspect.signature(components.transformer.forward).parameters.keys())
	        shared_kwargs = {}
	        for field_name, field_value in block_state.denoiser_input_fields.items():
	            if field_name in transformer_args and field_name not in guider_inputs:
	                shared_kwargs[field_name] = field_value

	        # Add loop-internal history latents with dtype casting
	        shared_kwargs["latents_history_short"] = block_state.latents_history_short.to(transformer_dtype)
	        shared_kwargs["latents_history_mid"] = block_state.latents_history_mid.to(transformer_dtype)
	        shared_kwargs["latents_history_long"] = block_state.latents_history_long.to(transformer_dtype)
	        shared_kwargs["attention_kwargs"] = block_state.attention_kwargs

	        # The patch size does not change between stages, so look it up once.
	        patch_size = components.transformer.config.patch_size

	        # Save original zero_init_steps if the guider supports it (e.g. ClassifierFreeZeroStarGuidance).
	        # Helios only applies zero init in pyramid stage 0 (lowest resolution), so we disable it
	        # for subsequent stages by temporarily setting zero_init_steps=0.
	        orig_zero_init_steps = getattr(components.guider, "zero_init_steps", None)

	        try:
	            for i_s in range(pyramid_num_stages):
	                # --- Stage setup ---

	                # Disable zero init for stages > 0 (only stage 0 should have zero init)
	                if orig_zero_init_steps is not None and i_s > 0:
	                    components.guider.zero_init_steps = 0

	                # a. Compute mu from current resolution (before upsample, matching standard pipeline)
	                image_seq_len = (latents.shape[-1] * latents.shape[-2] * latents.shape[-3]) // (
	                    patch_size[0] * patch_size[1] * patch_size[2]
	                )
	                mu = calculate_shift(
	                    image_seq_len,
	                    components.scheduler.config.get("base_image_seq_len", 256),
	                    components.scheduler.config.get("max_image_seq_len", 4096),
	                    components.scheduler.config.get("base_shift", 0.5),
	                    components.scheduler.config.get("max_shift", 1.15),
	                )

	                # b. Set scheduler timesteps for this stage
	                num_inference_steps = block_state.pyramid_num_inference_steps_list[i_s]
	                components.scheduler.set_timesteps(
	                    num_inference_steps,
	                    i_s,
	                    device=device,
	                    mu=mu,
	                )
	                timesteps = components.scheduler.timesteps

	                # c. Upsample + block noise correction for stages > 0
	                if i_s > 0:
	                    batch_size, num_channels_latents, num_frames, current_h, current_w = latents.shape
	                    new_h = current_h * 2
	                    new_w = current_w * 2

	                    # Fold frames into the batch axis so the 2D nearest-neighbour upsample runs per frame.
	                    latents = latents.permute(0, 2, 1, 3, 4).reshape(
	                        batch_size * num_frames, num_channels_latents, current_h, current_w
	                    )
	                    latents = F.interpolate(latents, size=(new_h, new_w), mode="nearest")
	                    latents = latents.reshape(batch_size, num_frames, num_channels_latents, new_h, new_w).permute(
	                        0, 2, 1, 3, 4
	                    )

	                    # Block noise correction
	                    ori_sigma = 1 - components.scheduler.ori_start_sigmas[i_s]
	                    gamma = components.scheduler.config.gamma
	                    alpha = 1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma)
	                    beta = alpha * (1 - ori_sigma) / math.sqrt(gamma)

	                    batch_size, num_channels_latents, num_frames, h, w = latents.shape
	                    noise = sample_block_noise(
	                        batch_size,
	                        num_channels_latents,
	                        num_frames,
	                        h,
	                        w,
	                        gamma,
	                        patch_size,
	                        device=device,
	                        generator=block_state.generator,
	                    )
	                    noise = noise.to(dtype=transformer_dtype)
	                    latents = alpha * latents + beta * noise

	                # --- Timestep denoising loop ---
	                num_warmup_steps = len(timesteps) - num_inference_steps * components.scheduler.order

	                with tqdm(total=num_inference_steps) as progress_bar:
	                    for i, t in enumerate(timesteps):
	                        timestep = t.expand(latents.shape[0]).to(torch.int64)
	                        latent_model_input = latents.to(transformer_dtype)

	                        components.guider.set_state(step=i, num_inference_steps=num_inference_steps, timestep=t)
	                        guider_state = components.guider.prepare_inputs(guider_inputs)

	                        for guider_state_batch in guider_state:
	                            components.guider.prepare_models(components.transformer)
	                            cond_kwargs = {kk: getattr(guider_state_batch, kk) for kk in guider_inputs}

	                            context_name = getattr(guider_state_batch, components.guider._identifier_key)
	                            with components.transformer.cache_context(context_name):
	                                guider_state_batch.noise_pred = components.transformer(
	                                    hidden_states=latent_model_input,
	                                    timestep=timestep,
	                                    return_dict=False,
	                                    **cond_kwargs,
	                                    **shared_kwargs,
	                                )[0]
	                            components.guider.cleanup_models(components.transformer)

	                        noise_pred = components.guider(guider_state)[0]

	                        # Scheduler step
	                        latents = components.scheduler.step(
	                            noise_pred,
	                            t,
	                            latents,
	                            generator=block_state.generator,
	                            return_dict=False,
	                        )[0]

	                        if i == len(timesteps) - 1 or (
	                            (i + 1) > num_warmup_steps and (i + 1) % components.scheduler.order == 0
	                        ):
	                            progress_bar.update()
	        finally:
	            # Restore original zero_init_steps even if a stage raised: the guider is a shared component,
	            # so a leaked override would silently disable zero init for every later chunk and call.
	            if orig_zero_init_steps is not None:
	                components.guider.zero_init_steps = orig_zero_init_steps

	        block_state.latents = latents
	        return components, block_state


	# ========================================
	# Post-Denoise Update
	# ========================================


	class HeliosChunkUpdateStep(ModularPipelineBlocks):
	    """Records a freshly denoised chunk and folds it into the running history."""

	    model_name = "helios"

	    @property
	    def description(self) -> str:
	        return (
	            "Post-denoising update step: appends the denoised latents to the chunk list, "
	            "captures image_latents from the first chunk if needed, and extends history_latents."
	        )

	    @property
	    def expected_components(self) -> list[ComponentSpec]:
	        return []

	    @property
	    def inputs(self) -> list[InputParam]:
	        latents = InputParam("latents", type_hint=torch.Tensor)
	        history_latents = InputParam("history_latents", type_hint=torch.Tensor)
	        keep_first_frame = InputParam("keep_first_frame", default=True, type_hint=bool)
	        return [latents, history_latents, keep_first_frame]

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        # Keep the denoised latents of this chunk in the running list.
	        collected = block_state.latent_chunks
	        chunk_latents = block_state.latents
	        collected.append(chunk_latents)

	        # On the first chunk, remember its first frame as the prefix when none has been set yet.
	        capture_first_frame = block_state.keep_first_frame and k == 0 and block_state.image_latents is None
	        if capture_first_frame:
	            block_state.image_latents = chunk_latents[:, :, 0:1, :, :]

	        # Grow the history along the frame axis (dim 2).
	        block_state.history_latents = torch.cat([block_state.history_latents, chunk_latents], dim=2)

	        return components, block_state


	# ========================================
	# Chunk Loop Wrapper
	# ========================================


	class HeliosChunkLoopWrapper(LoopSequentialPipelineBlocks):
	    """Outer chunk loop that iterates over temporal chunks.

	    History indices, scheduler params, and history state are prepared by HeliosPrepareHistoryStep and
	    HeliosSetTimestepsStep before this block runs. Sub-blocks handle per-chunk preparation, denoising, and history
	    updates.
	    """

	    model_name = "helios"

	    @property
	    def description(self) -> str:
	        return (
	            "Pipeline block that iterates over temporal chunks for progressive video generation. "
	            "At each chunk iteration, it runs sub-blocks for preparation, denoising, and history updates."
	        )

	    @property
	    def loop_inputs(self) -> list[InputParam]:
	        return [
	            InputParam("num_latent_chunk", required=True, type_hint=int),
	        ]

	    @property
	    def loop_intermediate_outputs(self) -> list[OutputParam]:
	        return [
	            OutputParam("latent_chunks", type_hint=list, description="List of per-chunk denoised latent tensors"),
	        ]

	    @torch.no_grad()
	    def __call__(
	        self, components: HeliosModularPipeline, state: PipelineState
	    ) -> tuple[HeliosModularPipeline, PipelineState]:
	        """Run the sub-blocks once per chunk and write the resulting block state back to `state`.

	        Args:
	            components: Pipeline components handed to every sub-block.
	            state: Pipeline state from which the block state is read and to which it is written back.

	        Returns:
	            The `(components, state)` pair.
	        """
	        block_state = self.get_block_state(state)
	        # Filled by the sub-blocks, one entry per chunk.
	        block_state.latent_chunks = []

	        # Sub-blocks read `image_latents` unconditionally, so make sure the attribute exists.
	        if not hasattr(block_state, "image_latents"):
	            block_state.image_latents = None

	        for k in range(block_state.num_latent_chunk):
	            components, block_state = self.loop_step(components, block_state, k=k)

	        self.set_block_state(state, block_state)

	        return components, state


	# ========================================
	# Composed Chunk Denoise Steps
	# ========================================


	class HeliosChunkDenoiseStep(HeliosChunkLoopWrapper):
	    """Chunk loop for T2V: slice history, draw noise, reset the scheduler, denoise, then update history."""

	    # Sub-blocks executed in this order for every chunk; names pair up positionally with the classes.
	    block_classes = [
	        HeliosChunkHistorySliceStep,
	        HeliosChunkNoiseGenStep,
	        HeliosChunkSchedulerResetStep,
	        HeliosChunkDenoiseInner,
	        HeliosChunkUpdateStep,
	    ]
	    block_names = [
	        "history_slice",
	        "noise_gen",
	        "scheduler_reset",
	        "denoise_inner",
	        "update_chunk",
	    ]

	    @property
	    def description(self) -> str:
	        summary = "T2V chunk denoise step that iterates over temporal chunks."
	        flow = "At each chunk: history_slice -> noise_gen -> scheduler_reset -> denoise_inner -> update_chunk."
	        return f"{summary}\n{flow}"


	class HeliosI2VChunkDenoiseStep(HeliosChunkLoopWrapper):
	    """Chunk loop for I2V: I2V history slice, draw noise, reset the scheduler, denoise, then update history."""

	    # Sub-blocks executed in this order for every chunk; names pair up positionally with the classes.
	    block_classes = [
	        HeliosI2VChunkHistorySliceStep,
	        HeliosChunkNoiseGenStep,
	        HeliosChunkSchedulerResetStep,
	        HeliosChunkDenoiseInner,
	        HeliosChunkUpdateStep,
	    ]
	    block_names = [
	        "history_slice",
	        "noise_gen",
	        "scheduler_reset",
	        "denoise_inner",
	        "update_chunk",
	    ]

	    @property
	    def description(self) -> str:
	        summary = "I2V chunk denoise step that iterates over temporal chunks."
	        flow = "At each chunk: history_slice (I2V) -> noise_gen -> scheduler_reset -> denoise_inner -> update_chunk."
	        return f"{summary}\n{flow}"


	class HeliosPyramidDistilledChunkDenoiseInner(ModularPipelineBlocks):
	    """Pyramid stage loop with DMD denoising, intended for distilled checkpoints.

	    Follows the same small-to-full-resolution strategy as HeliosPyramidChunkDenoiseInner, with these differences:
	    - The default guider is configured with guidance_scale=1.0
	    - `is_amplify_first_chunk` is forwarded to the scheduler for the first chunk (k == 0)
	    - The latents each stage starts from are recorded and passed, with other DMD arguments, to scheduler.step()
	    """

	    model_name = "helios-pyramid"

	    @property
	    def description(self) -> str:
	        return (
	            "Distilled pyramid denoising inner block for DMD checkpoints. Loops over pyramid stages "
	            "from smallest to full resolution with guidance disabled and DMD scheduler support."
	        )

	    @property
	    def expected_components(self) -> list[ComponentSpec]:
	        guider_spec = ComponentSpec(
	            "guider",
	            ClassifierFreeGuidance,
	            config=FrozenDict({"guidance_scale": 1.0}),
	            default_creation_method="from_config",
	        )
	        return [
	            ComponentSpec("transformer", HeliosTransformer3DModel),
	            ComponentSpec("scheduler", HeliosScheduler),
	            guider_spec,
	        ]

	    @property
	    def inputs(self) -> list[InputParam]:
	        stage_steps = InputParam(
	            "pyramid_num_inference_steps_list",
	            default=[2, 2, 2],
	            type_hint=list,
	            description="Number of denoising steps per pyramid stage.",
	        )
	        amplify_first = InputParam(
	            "is_amplify_first_chunk",
	            default=True,
	            type_hint=bool,
	            description="Whether to double the first chunk's timesteps via the scheduler for amplified generation.",
	        )
	        return [
	            InputParam.template("latents"),
	            InputParam("prompt_embeds", type_hint=torch.Tensor),
	            InputParam("negative_prompt_embeds", type_hint=torch.Tensor),
	            InputParam.template("denoiser_input_fields"),
	            stage_steps,
	            amplify_first,
	            InputParam.template("attention_kwargs"),
	            InputParam.template("generator"),
	        ]

	    @torch.no_grad()
	    def __call__(self, components: HeliosModularPipeline, block_state: BlockState, k: int):
	        device = components._execution_device
	        model_dtype = components.transformer.dtype
	        latents = block_state.latents
	        num_stages = len(block_state.pyramid_num_inference_steps_list)
	        is_first_chunk = k == 0

	        # Latents each stage starts from; entry `stage_idx` is handed to the scheduler as the DMD noisy tensor.
	        stage_start_latents = [latents]

	        # The text embedding is the only input that differs between the conditional and unconditional pass.
	        guider_inputs = {
	            "encoder_hidden_states": (block_state.prompt_embeds, block_state.negative_prompt_embeds),
	        }

	        # Forward every denoiser field the transformer accepts, except those the guider already manages.
	        accepted_args = set(inspect.signature(components.transformer.forward).parameters.keys())
	        shared_kwargs = {
	            field_name: field_value
	            for field_name, field_value in block_state.denoiser_input_fields.items()
	            if field_name in accepted_args and field_name not in guider_inputs
	        }

	        # History windows are cast to the transformer dtype; attention kwargs are passed through untouched.
	        shared_kwargs.update(
	            latents_history_short=block_state.latents_history_short.to(model_dtype),
	            latents_history_mid=block_state.latents_history_mid.to(model_dtype),
	            latents_history_long=block_state.latents_history_long.to(model_dtype),
	            attention_kwargs=block_state.attention_kwargs,
	        )

	        for stage_idx in range(num_stages):
	            # --- Stage setup ---
	            patch_size = components.transformer.config.patch_size

	            # a. mu is derived from the resolution the latents have *before* any upsampling in this stage.
	            latent_volume = latents.shape[-1] * latents.shape[-2] * latents.shape[-3]
	            patch_volume = patch_size[0] * patch_size[1] * patch_size[2]
	            image_seq_len = latent_volume // patch_volume
	            scheduler_config = components.scheduler.config
	            mu = calculate_shift(
	                image_seq_len,
	                scheduler_config.get("base_image_seq_len", 256),
	                scheduler_config.get("max_image_seq_len", 4096),
	                scheduler_config.get("base_shift", 0.5),
	                scheduler_config.get("max_shift", 1.15),
	            )

	            # b. Stage timesteps; amplification is requested only for the first chunk.
	            num_inference_steps = block_state.pyramid_num_inference_steps_list[stage_idx]
	            components.scheduler.set_timesteps(
	                num_inference_steps,
	                stage_idx,
	                device=device,
	                mu=mu,
	                is_amplify_first_chunk=block_state.is_amplify_first_chunk and is_first_chunk,
	            )
	            timesteps = components.scheduler.timesteps

	            # c. Every stage after the first doubles the spatial size and re-noises with block noise.
	            if stage_idx > 0:
	                batch_size, channels, num_frames, cur_h, cur_w = latents.shape
	                up_h, up_w = cur_h * 2, cur_w * 2

	                # Fold frames into the batch axis so the 2D nearest-neighbour upsample runs per frame.
	                per_frame = latents.permute(0, 2, 1, 3, 4).reshape(batch_size * num_frames, channels, cur_h, cur_w)
	                per_frame = F.interpolate(per_frame, size=(up_h, up_w), mode="nearest")
	                latents = per_frame.reshape(batch_size, num_frames, channels, up_h, up_w).permute(0, 2, 1, 3, 4)

	                # Block noise correction
	                ori_sigma = 1 - components.scheduler.ori_start_sigmas[stage_idx]
	                gamma = components.scheduler.config.gamma
	                alpha = 1 / (math.sqrt(1 + (1 / gamma)) * (1 - ori_sigma) + ori_sigma)
	                beta = alpha * (1 - ori_sigma) / math.sqrt(gamma)

	                block_noise = sample_block_noise(
	                    batch_size,
	                    channels,
	                    num_frames,
	                    up_h,
	                    up_w,
	                    gamma,
	                    patch_size,
	                    device=device,
	                    generator=block_state.generator,
	                )
	                block_noise = block_noise.to(dtype=model_dtype)
	                latents = alpha * latents + beta * block_noise

	                stage_start_latents.append(latents)

	            # --- Timestep denoising loop ---
	            total_timesteps = len(timesteps)
	            num_warmup_steps = total_timesteps - num_inference_steps * components.scheduler.order

	            with tqdm(total=num_inference_steps) as progress_bar:
	                for step_idx, t in enumerate(timesteps):
	                    guider = components.guider
	                    batched_timestep = t.expand(latents.shape[0]).to(torch.int64)
	                    model_input = latents.to(model_dtype)

	                    guider.set_state(step=step_idx, num_inference_steps=num_inference_steps, timestep=t)
	                    guider_state = guider.prepare_inputs(guider_inputs)

	                    # One transformer forward pass per batch the guider prepared.
	                    for guider_batch in guider_state:
	                        guider.prepare_models(components.transformer)
	                        cond_kwargs = {name: getattr(guider_batch, name) for name in guider_inputs}

	                        context_name = getattr(guider_batch, guider._identifier_key)
	                        with components.transformer.cache_context(context_name):
	                            guider_batch.noise_pred = components.transformer(
	                                hidden_states=model_input,
	                                timestep=batched_timestep,
	                                return_dict=False,
	                                **cond_kwargs,
	                                **shared_kwargs,
	                            )[0]
	                        guider.cleanup_models(components.transformer)

	                    noise_pred = guider(guider_state)[0]

	                    # Scheduler step, with the extra DMD arguments.
	                    latents = components.scheduler.step(
	                        noise_pred,
	                        t,
	                        latents,
	                        generator=block_state.generator,
	                        return_dict=False,
	                        cur_sampling_step=step_idx,
	                        dmd_noisy_tensor=stage_start_latents[stage_idx],
	                        dmd_sigmas=components.scheduler.sigmas,
	                        dmd_timesteps=components.scheduler.timesteps,
	                        all_timesteps=timesteps,
	                    )[0]

	                    steps_done = step_idx + 1
	                    past_warmup = steps_done > num_warmup_steps and steps_done % components.scheduler.order == 0
	                    if steps_done == total_timesteps or past_warmup:
	                        progress_bar.update()

	        block_state.latents = latents
	        return components, block_state


	class HeliosPyramidChunkDenoiseStep(HeliosChunkLoopWrapper):
	    """Chunk loop for T2V pyramid denoising: slice history, draw pyramid noise, denoise per stage, update."""

	    # Sub-blocks executed in this order for every chunk; names pair up positionally with the classes.
	    block_classes = [
	        HeliosChunkHistorySliceStep,
	        HeliosPyramidChunkNoiseGenStep,
	        HeliosPyramidChunkDenoiseInner,
	        HeliosChunkUpdateStep,
	    ]
	    block_names = [
	        "history_slice",
	        "noise_gen",
	        "denoise_inner",
	        "update_chunk",
	    ]

	    @property
	    def description(self) -> str:
	        lines = [
	            "T2V pyramid chunk denoise step that iterates over temporal chunks.",
	            "At each chunk: history_slice -> noise_gen (pyramid) -> denoise_inner (pyramid stages) -> update_chunk.",
	            "Denoising starts at the smallest resolution and progressively upsamples.",
	        ]
	        return "\n".join(lines)


	class HeliosPyramidI2VChunkDenoiseStep(HeliosChunkLoopWrapper):
	    """Chunk loop for I2V pyramid denoising: I2V history slice, draw pyramid noise, denoise per stage, update."""

	    # Sub-blocks executed in this order for every chunk; names pair up positionally with the classes.
	    block_classes = [
	        HeliosI2VChunkHistorySliceStep,
	        HeliosPyramidChunkNoiseGenStep,
	        HeliosPyramidChunkDenoiseInner,
	        HeliosChunkUpdateStep,
	    ]
	    block_names = [
	        "history_slice",
	        "noise_gen",
	        "denoise_inner",
	        "update_chunk",
	    ]

	    @property
	    def description(self) -> str:
	        lines = [
	            "I2V pyramid chunk denoise step that iterates over temporal chunks.",
	            "At each chunk: history_slice (I2V) -> noise_gen (pyramid) -> denoise_inner (pyramid stages) -> update_chunk.",
	            "Denoising starts at the smallest resolution and progressively upsamples.",
	        ]
	        return "\n".join(lines)


	class HeliosPyramidDistilledChunkDenoiseStep(HeliosChunkLoopWrapper):
	    """Chunk loop for T2V distilled pyramid denoising, using the DMD inner block."""

	    # Sub-blocks executed in this order for every chunk; names pair up positionally with the classes.
	    block_classes = [
	        HeliosChunkHistorySliceStep,
	        HeliosPyramidChunkNoiseGenStep,
	        HeliosPyramidDistilledChunkDenoiseInner,
	        HeliosChunkUpdateStep,
	    ]
	    block_names = [
	        "history_slice",
	        "noise_gen",
	        "denoise_inner",
	        "update_chunk",
	    ]

	    @property
	    def description(self) -> str:
	        summary = "T2V distilled pyramid chunk denoise step with DMD scheduler."
	        flow = "At each chunk: history_slice -> noise_gen (pyramid) -> denoise_inner (distilled/DMD) -> update_chunk."
	        return f"{summary}\n{flow}"


	class HeliosPyramidDistilledI2VChunkDenoiseStep(HeliosChunkLoopWrapper):
	    """Chunk loop for I2V distilled pyramid denoising, using the DMD inner block."""

	    # Sub-blocks executed in this order for every chunk; names pair up positionally with the classes.
	    block_classes = [
	        HeliosI2VChunkHistorySliceStep,
	        HeliosPyramidChunkNoiseGenStep,
	        HeliosPyramidDistilledChunkDenoiseInner,
	        HeliosChunkUpdateStep,
	    ]
	    block_names = [
	        "history_slice",
	        "noise_gen",
	        "denoise_inner",
	        "update_chunk",
	    ]

	    @property
	    def description(self) -> str:
	        summary = "I2V distilled pyramid chunk denoise step with DMD scheduler."
	        flow = (
	            "At each chunk: history_slice (I2V) -> noise_gen (pyramid) -> denoise_inner (distilled/DMD) -> update_chunk."
	        )
	        return f"{summary}\n{flow}"