Spaces:

ideogram-ai
/

ideogram4

Running on Zero

App Files Files Community

ideogram4 / diffusers_src /src /diffusers /models /controlnets /controlnet_cosmos.py

multimodalart HF Staff

Embed diffusers PR source; install locally

b8c861f verified 8 days ago

raw

history blame

13.1 kB

	from dataclasses import dataclass
	from typing import List, Optional, Tuple, Union

	import torch
	import torch.nn as nn

	from ...configuration_utils import ConfigMixin, register_to_config
	from ...loaders import FromOriginalModelMixin
	from ...utils import BaseOutput, is_torchvision_available, logging
	from ..modeling_utils import ModelMixin
	from ..transformers.transformer_cosmos import (
	CosmosEmbedding,
	CosmosLearnablePositionalEmbed,
	CosmosPatchEmbed,
	CosmosRotaryPosEmbed,
	CosmosTransformerBlock,
	)


	# torchvision is an optional dependency: `transforms` is only bound when it is installed.
	if is_torchvision_available():
	    from torchvision import transforms

	# Module-level logger, named after this module.
	logger = logging.get_logger(__name__) # pylint: disable=invalid-name


	@dataclass
	class CosmosControlNetOutput(BaseOutput):
	    """
	    Container for the result returned by [`CosmosControlNetModel`].

	    Args:
	        control_block_samples (`list[torch.Tensor]`):
	            Activations produced by the control blocks, meant to be injected into transformer blocks.
	    """

	    control_block_samples: List[torch.Tensor]


	class CosmosControlNetModel(ModelMixin, ConfigMixin, FromOriginalModelMixin):
	    r"""
	    ControlNet for Cosmos Transfer2.5.

	    This model duplicates the shared embedding modules from the transformer (patch_embed, time_embed,
	    learnable_pos_embed, img_context_proj) to enable proper CPU offloading. The forward() method computes everything
	    internally from raw inputs.

	    Args:
	        n_controlnet_blocks (`int`, defaults to `4`):
	            Number of `CosmosTransformerBlock` control blocks. One control sample is produced per block.
	        in_channels (`int`, defaults to `130`):
	            Input channels of `patch_embed`, i.e. the control latents after the condition-mask and padding-mask
	            channels have been appended.
	        latent_channels (`int`, defaults to `18`):
	            Input channels of `patch_embed_base`: base latent channels (latents + condition_mask) + padding_mask.
	        model_channels (`int`, defaults to `2048`):
	            Output width of both patch embeddings; also used for the time embedding and the learnable positional
	            embedding.
	        num_attention_heads (`int`, defaults to `32`):
	            Forwarded to every control block.
	        attention_head_dim (`int`, defaults to `128`):
	            Forwarded to every control block and used as the hidden size of the rotary embedding.
	        mlp_ratio (`float`, defaults to `4.0`):
	            Forwarded to every control block.
	        text_embed_dim (`int`, defaults to `1024`):
	            Passed to every control block as `cross_attention_dim`.
	        adaln_lora_dim (`int`, defaults to `256`):
	            Forwarded to every control block.
	        patch_size (`Tuple[int, int, int]`, defaults to `(1, 2, 2)`):
	            Patch size (t, h, w) shared by the patch embeddings and the positional embeddings.
	        max_size (`Tuple[int, int, int]`, defaults to `(128, 240, 240)`):
	            Forwarded to the rotary and learnable positional embeddings.
	        rope_scale (`Tuple[float, float, float]`, defaults to `(2.0, 1.0, 1.0)`):
	            Forwarded to the rotary positional embedding.
	        extra_pos_embed_type (`str`, *optional*):
	            When equal to `"learnable"`, a `CosmosLearnablePositionalEmbed` is created; otherwise none is used.
	        img_context_dim_in (`int`, *optional*):
	            When set to a positive value, a `Linear + GELU` image-context projection is created and the control
	            blocks are built with `img_context=True`.
	        img_context_dim_out (`int`, defaults to `2048`):
	            Output width of the image-context projection.
	        use_crossattn_projection (`bool`, defaults to `False`):
	            Whether to create a `Linear + GELU` projection that is applied to the text context.
	        crossattn_proj_in_channels (`int`, defaults to `1024`):
	            Input width of the text-context projection.
	        encoder_hidden_states_channels (`int`, defaults to `1024`):
	            Output width of the text-context projection.
	    """

	    _supports_gradient_checkpointing = True
	    _skip_layerwise_casting_patterns = ["patch_embed", "patch_embed_base", "time_embed"]
	    _no_split_modules = ["CosmosTransformerBlock"]
	    _keep_in_fp32_modules = ["learnable_pos_embed"]

	    @register_to_config
	    def __init__(
	        self,
	        n_controlnet_blocks: int = 4,
	        in_channels: int = 130,
	        latent_channels: int = 18,  # base latent channels (latents + condition_mask) + padding_mask
	        model_channels: int = 2048,
	        num_attention_heads: int = 32,
	        attention_head_dim: int = 128,
	        mlp_ratio: float = 4.0,
	        text_embed_dim: int = 1024,
	        adaln_lora_dim: int = 256,
	        patch_size: Tuple[int, int, int] = (1, 2, 2),
	        max_size: Tuple[int, int, int] = (128, 240, 240),
	        rope_scale: Tuple[float, float, float] = (2.0, 1.0, 1.0),
	        extra_pos_embed_type: str | None = None,
	        img_context_dim_in: int | None = None,
	        img_context_dim_out: int = 2048,
	        use_crossattn_projection: bool = False,
	        crossattn_proj_in_channels: int = 1024,
	        encoder_hidden_states_channels: int = 1024,
	    ):
	        super().__init__()

	        # Control latents and base latents have different channel counts, so each gets its own patch embedding.
	        self.patch_embed = CosmosPatchEmbed(in_channels, model_channels, patch_size, bias=False)

	        self.patch_embed_base = CosmosPatchEmbed(latent_channels, model_channels, patch_size, bias=False)
	        self.time_embed = CosmosEmbedding(model_channels, model_channels)

	        self.learnable_pos_embed = None
	        if extra_pos_embed_type == "learnable":
	            self.learnable_pos_embed = CosmosLearnablePositionalEmbed(
	                hidden_size=model_channels,
	                max_size=max_size,
	                patch_size=patch_size,
	            )

	        # Image context is enabled only for a positive input width; computed once and reused for the blocks.
	        uses_img_context = img_context_dim_in is not None and img_context_dim_in > 0

	        self.img_context_proj = None
	        if uses_img_context:
	            self.img_context_proj = nn.Sequential(
	                nn.Linear(img_context_dim_in, img_context_dim_out, bias=True),
	                nn.GELU(),
	            )

	        # Cross-attention projection for text embeddings (same as transformer)
	        self.crossattn_proj = None
	        if use_crossattn_projection:
	            self.crossattn_proj = nn.Sequential(
	                nn.Linear(crossattn_proj_in_channels, encoder_hidden_states_channels, bias=True),
	                nn.GELU(),
	            )

	        # RoPE for both control and base latents
	        self.rope = CosmosRotaryPosEmbed(
	            hidden_size=attention_head_dim, max_size=max_size, patch_size=patch_size, rope_scale=rope_scale
	        )

	        self.control_blocks = nn.ModuleList(
	            [
	                CosmosTransformerBlock(
	                    num_attention_heads=num_attention_heads,
	                    attention_head_dim=attention_head_dim,
	                    cross_attention_dim=text_embed_dim,
	                    mlp_ratio=mlp_ratio,
	                    adaln_lora_dim=adaln_lora_dim,
	                    qk_norm="rms_norm",
	                    out_bias=False,
	                    img_context=uses_img_context,
	                    # Only the first block is built with `before_proj`; every block has `after_proj`.
	                    before_proj=(block_idx == 0),
	                    after_proj=True,
	                )
	                for block_idx in range(n_controlnet_blocks)
	            ]
	        )

	        self.gradient_checkpointing = False

	    def _expand_conditioning_scale(self, conditioning_scale: float | list[float]) -> List[float]:
	        """
	        Return exactly one conditioning scale per control block.

	        Args:
	            conditioning_scale: A single scale applied to every block, or a list/tuple of per-block scales.

	        Returns:
	            A list with `len(self.control_blocks)` entries. A sequence of the wrong length is repeated cyclically
	            and/or trimmed to fit, and a warning is logged.

	        Raises:
	            ValueError: If an empty sequence is passed (it cannot be repeated to cover the blocks).
	        """
	        num_blocks = len(self.control_blocks)

	        if not isinstance(conditioning_scale, (list, tuple)):
	            return [conditioning_scale] * num_blocks

	        # Copy so the caller's sequence is never aliased, and so tuples behave like lists.
	        scales = list(conditioning_scale)
	        if not scales:
	            raise ValueError("`conditioning_scale` must contain at least one value when passed as a list.")

	        if len(scales) != num_blocks:
	            # Warn for both too few and too many scales, as the message promises.
	            logger.warning(
	                "Received %d control scales, but control network defines %d blocks. "
	                "Scales will be trimmed or repeated to match.",
	                len(scales),
	                num_blocks,
	            )
	            scales = (scales * num_blocks)[:num_blocks]
	        return scales

	    def _padding_mask_channel(
	        self,
	        padding_mask: torch.Tensor | None,
	        reference: torch.Tensor,
	        batch_size: int,
	        num_frames: int,
	    ) -> torch.Tensor:
	        """
	        Build the padding-mask channel that is concatenated to `reference` along dim 1.

	        Args:
	            padding_mask: Mask whose leading (batch) dimension is 1 or `batch_size`, or None for an all-zero mask.
	            reference: Tensor the channel will be concatenated to; its last two dims give the target size.
	            batch_size: Batch size of the output channel.
	            num_frames: Number of frames of the output channel.

	        Returns:
	            Tensor with dims (`batch_size`, mask channels, `num_frames`, height, width).

	        Raises:
	            ImportError: If a mask is given but torchvision (needed for resizing) is not installed.
	            ValueError: If the mask's batch dimension is neither 1 nor `batch_size`.
	        """
	        height, width = reference.shape[-2:]

	        if padding_mask is None:
	            # `padding_mask` is optional in `forward`; fall back to "nothing is padded".
	            return reference.new_zeros((batch_size, 1, num_frames, height, width))

	        if not is_torchvision_available():
	            raise ImportError("Resizing `padding_mask` requires torchvision to be installed.")

	        resized = transforms.functional.resize(
	            padding_mask, [height, width], interpolation=transforms.InterpolationMode.NEAREST
	        )

	        mask_batch = resized.shape[0]
	        if mask_batch not in (1, batch_size):
	            raise ValueError(
	                f"Expected `padding_mask` to have a batch dimension of 1 or {batch_size}, but got {mask_batch}."
	            )

	        # Repeat over the batch only when the mask is shared (batch dim 1); a per-sample mask is used as is.
	        return resized.unsqueeze(2).repeat(batch_size // mask_batch, 1, num_frames, 1, 1)

	    def forward(
	        self,
	        controls_latents: torch.Tensor,
	        latents: torch.Tensor,
	        timestep: torch.Tensor,
	        encoder_hidden_states: Union[Optional[torch.Tensor], Tuple[Optional[torch.Tensor], Optional[torch.Tensor]]],
	        condition_mask: torch.Tensor,
	        conditioning_scale: float | list[float] = 1.0,
	        padding_mask: torch.Tensor | None = None,
	        attention_mask: torch.Tensor | None = None,
	        fps: int | None = None,
	        return_dict: bool = True,
	    ) -> Union[CosmosControlNetOutput, Tuple[List[torch.Tensor]]]:
	        """
	        Forward pass for the ControlNet.

	        Args:
	            controls_latents: Control signal latents [B, C, T, H, W]
	            latents: Base latents from the noising process [B, C, T, H, W]
	            timestep: Diffusion timestep tensor, either 1-D or of shape [B, 1, T, 1, 1]
	            encoder_hidden_states: Tuple of (text_context, img_context) or text_context
	            condition_mask: Conditioning mask [B, 1, T, H, W]
	            conditioning_scale: Scale factor(s) for control outputs
	            padding_mask: Padding mask [1, 1, H, W] or [B, 1, H, W]; an all-zero mask is used when None
	            attention_mask: Optional attention mask or None
	            fps: Frames per second for RoPE or None
	            return_dict: Whether to return a CosmosControlNetOutput or a tuple

	        Returns:
	            CosmosControlNetOutput or tuple of control tensors

	        Raises:
	            ValueError: If `timestep` is neither 1-D nor of shape [B, 1, T, 1, 1], if `padding_mask` has an
	                unsupported batch dimension, or if `conditioning_scale` is an empty list.
	            ImportError: If `padding_mask` is given but torchvision is not installed.
	        """
	        batch_size, _, num_frames, height, width = controls_latents.shape

	        # 1. Prepare control latents
	        control_hidden_states = controls_latents
	        # `in_channels` also counts the condition-mask and padding-mask channels appended below,
	        # so the control latents themselves are zero-padded up to `in_channels - 2`.
	        missing_channels = self.config.in_channels - 2 - control_hidden_states.shape[1]
	        if missing_channels > 0:
	            channel_padding = control_hidden_states.new_zeros(
	                (batch_size, missing_channels, num_frames, height, width)
	            )
	            control_hidden_states = torch.cat([control_hidden_states, channel_padding], dim=1)

	        if condition_mask is not None:
	            control_hidden_states = torch.cat([control_hidden_states, condition_mask], dim=1)
	        else:
	            control_hidden_states = torch.cat(
	                [control_hidden_states, torch.zeros_like(controls_latents[:, :1])], dim=1
	            )

	        control_hidden_states = torch.cat(
	            [
	                control_hidden_states,
	                self._padding_mask_channel(padding_mask, control_hidden_states, batch_size, num_frames),
	            ],
	            dim=1,
	        )

	        # 2. Prepare base latents (same processing as transformer.forward)
	        base_hidden_states = latents
	        if condition_mask is not None:
	            base_hidden_states = torch.cat([base_hidden_states, condition_mask], dim=1)

	        base_hidden_states = torch.cat(
	            [
	                base_hidden_states,
	                self._padding_mask_channel(padding_mask, base_hidden_states, batch_size, num_frames),
	            ],
	            dim=1,
	        )

	        # 3. Generate positional embeddings (shared for both)
	        image_rotary_emb = self.rope(control_hidden_states, fps=fps)
	        extra_pos_emb = None
	        if self.learnable_pos_embed is not None:
	            extra_pos_emb = self.learnable_pos_embed(control_hidden_states)

	        # 4. Patchify control latents
	        control_hidden_states = self.patch_embed(control_hidden_states)
	        control_hidden_states = control_hidden_states.flatten(1, 3)

	        # 5. Patchify base latents
	        p_t, p_h, p_w = self.config.patch_size
	        post_patch_num_frames = num_frames // p_t
	        post_patch_height = height // p_h
	        post_patch_width = width // p_w

	        base_hidden_states = self.patch_embed_base(base_hidden_states)
	        base_hidden_states = base_hidden_states.flatten(1, 3)

	        # 6. Time embeddings
	        if timestep.ndim == 1:
	            temb, embedded_timestep = self.time_embed(base_hidden_states, timestep)
	        elif timestep.ndim == 5:
	            latent_batch, _, latent_frames, _, _ = latents.shape
	            # Explicit raise instead of `assert`, so the check survives `python -O`.
	            if tuple(timestep.shape) != (latent_batch, 1, latent_frames, 1, 1):
	                raise ValueError(f"Expected timestep to have shape [B, 1, T, 1, 1], but got {timestep.shape}")
	            timestep_flat = timestep.flatten()
	            temb, embedded_timestep = self.time_embed(base_hidden_states, timestep_flat)
	            # Broadcast the per-frame embeddings over the patch grid, then flatten to the token axis.
	            temb, embedded_timestep = (
	                x.view(latent_batch, post_patch_num_frames, 1, 1, -1)
	                .expand(-1, -1, post_patch_height, post_patch_width, -1)
	                .flatten(1, 3)
	                for x in (temb, embedded_timestep)
	            )
	        else:
	            raise ValueError(f"Expected timestep to have shape [B, 1, T, 1, 1] or [T], but got {timestep.shape}")

	        # 7. Process encoder hidden states
	        if isinstance(encoder_hidden_states, tuple):
	            text_context, img_context = encoder_hidden_states
	        else:
	            text_context = encoder_hidden_states
	            img_context = None

	        # Apply cross-attention projection to text context
	        if self.crossattn_proj is not None:
	            text_context = self.crossattn_proj(text_context)

	        # Apply cross-attention projection to image context (if provided)
	        if img_context is not None and self.img_context_proj is not None:
	            img_context = self.img_context_proj(img_context)

	        # Combine text and image context into a single tuple
	        if self.config.img_context_dim_in is not None and self.config.img_context_dim_in > 0:
	            processed_encoder_hidden_states = (text_context, img_context)
	        else:
	            processed_encoder_hidden_states = text_context

	        # 8. Prepare attention mask
	        if attention_mask is not None:
	            attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)  # [B, 1, 1, S]

	        # 9. Run control blocks
	        scales = self._expand_conditioning_scale(conditioning_scale)
	        result = []
	        for block_idx, (block, scale) in enumerate(zip(self.control_blocks, scales)):
	            if torch.is_grad_enabled() and self.gradient_checkpointing:
	                # Positional arguments, in the same order as the keyword call in the `else` branch.
	                control_hidden_states, control_proj = self._gradient_checkpointing_func(
	                    block,
	                    control_hidden_states,
	                    processed_encoder_hidden_states,
	                    embedded_timestep,
	                    temb,
	                    image_rotary_emb,
	                    extra_pos_emb,
	                    attention_mask,
	                    None,  # controlnet_residual
	                    base_hidden_states,
	                    block_idx,
	                )
	            else:
	                control_hidden_states, control_proj = block(
	                    hidden_states=control_hidden_states,
	                    encoder_hidden_states=processed_encoder_hidden_states,
	                    embedded_timestep=embedded_timestep,
	                    temb=temb,
	                    image_rotary_emb=image_rotary_emb,
	                    extra_pos_emb=extra_pos_emb,
	                    attention_mask=attention_mask,
	                    controlnet_residual=None,
	                    latents=base_hidden_states,
	                    block_idx=block_idx,
	                )
	            result.append(control_proj * scale)

	        if not return_dict:
	            return (result,)

	        return CosmosControlNetOutput(control_block_samples=result)