|
|
| import os |
| from typing import List, Optional, Union |
|
|
| import numpy as np |
| import torch |
| from PIL import Image |
| from safetensors.torch import load_file as load_safetensors |
| from safetensors.torch import save_file as save_safetensors |
| from torch import nn |
| from tqdm import tqdm |
|
|
| from nemo.collections.diffusion.encoders.conditioner import FrozenCLIPEmbedder, FrozenT5Embedder |
| from nemo.collections.diffusion.models.flux.model import Flux |
| from nemo.collections.diffusion.models.flux_controlnet.model import FluxControlNet, FluxControlNetConfig |
| from nemo.collections.diffusion.sampler.flow_matching.flow_match_euler_discrete import FlowMatchEulerDiscreteScheduler |
| from nemo.collections.diffusion.utils.flux_ckpt_converter import flux_transformer_converter |
| from nemo.collections.diffusion.utils.flux_pipeline_utils import FluxModelParams |
| from nemo.collections.diffusion.vae.autoencoder import AutoEncoder |
| from nemo.utils import logging |
|
|
|
|
| class FluxInferencePipeline(nn.Module): |
| """ |
| A pipeline for performing image generation with Flux. |
| |
| Args: |
| params (FluxModelParams, optional): |
| Configuration parameters for the model pipeline, including device settings and model configurations. |
| flux (Flux, optional): |
| A pre-initialized Flux model used for the transformation process. |
| If None, a new Flux model is created using the configuration in `params`. |
| vae (AutoEncoder, optional): |
| A pre-initialized VAE (Variational Autoencoder) model. |
| If None, a new VAE model is created using the configuration in `params.vae_config`. |
| t5 (FrozenT5Embedder, optional): |
| A pre-initialized FrozenT5Embedder model. |
| If None, a new T5 model is created using the configuration in `params.t5_params`. |
| clip (FrozenCLIPEmbedder, optional): |
| A pre-initialized FrozenCLIPEmbedder model. |
| If None, a new CLIP model is created using the configuration in `params.clip_params`. |
| scheduler_steps (int, optional): |
| The number of scheduler steps to use for inference. Default is 1000. |
| |
| Attributes: |
| device (torch.device): The device (CPU or GPU) where the models will be placed. |
| vae (AutoEncoder): The VAE model used for image reconstruction or generation. |
| clip_encoder (FrozenCLIPEmbedder): The CLIP encoder for processing image-text inputs. |
| t5_encoder (FrozenT5Embedder): The T5 encoder for processing text inputs. |
| transformer (Flux): The Flux model used for image-text joint processing. |
| vae_scale_factor (float): A spatial scale factor for the VAE, derived from the number of channel-multiplier stages in the VAE. |
| scheduler (FlowMatchEulerDiscreteScheduler): Scheduler used for controlling the flow of inference steps. |
| params (FluxModelParams): Configuration parameters used for model setup. |
| |
| Methods: |
| load_from_pretrained: |
| Loads model weights from a checkpoint. |
| encoder_prompt: |
| Encodes text prompts and retrieves embeddings. |
| _prepare_latent_image_ids: |
| Prepares latent image ids for the generation process. |
| _pack_latents: |
| Packs latents into the desired format for input to the model. |
| _unpack_latents: |
| Unpacks latents from the model into image format. |
| _calculate_shift: |
| Calculates the timestep-shift parameter based on the image sequence length. |
| prepare_latents: |
| Prepares the latent tensors and latent image ids for generation. |
| _generate_rand_latents: |
| Generates random latents using a specified generator. |
| numpy_to_pil: |
| Converts a numpy array or a batch of images to PIL images. |
| torch_to_numpy: |
| Converts a tensor of images to a numpy array. |
| denormalize: |
| Denormalizes the image to the range [0, 1]. |
| __call__: |
| Runs the entire image generation process based on the input prompt, including encoding, |
| latent preparation, inference, and output generation. |
| |
| Example: |
| pipeline = FluxInferencePipeline(params) |
| images = pipeline( |
| prompt=["A beautiful sunset over a mountain range"], |
| height=512, |
| width=512, |
| num_inference_steps=50, |
| guidance_scale=7.5, |
| output_path='./flux_outputs', |
| ) |
| """ |
|
|
| def __init__( |
| self, |
| params: FluxModelParams = None, |
| flux: Optional[Flux] = None, |
| vae: Optional[AutoEncoder] = None, |
| t5: Optional[FrozenT5Embedder] = None, |
| clip: Optional[FrozenCLIPEmbedder] = None, |
| scheduler_steps: int = 1000, |
| ): |
| """ |
| Initializes the FluxInferencePipeline with the provided models and configurations. |
| |
| Args: |
| params (FluxModelParams, optional): |
| Configuration parameters for the model pipeline, including device settings and model configurations. |
| flux (Flux, optional): |
| A pre-initialized Flux model used for the transformation process. |
| If None, a new Flux model is created using the configuration in `params`. |
| vae (AutoEncoder, optional): |
| A pre-initialized VAE (Variational Autoencoder) model. |
| If None, a new VAE model is created using the configuration in `params.vae_config`. |
| t5 (FrozenT5Embedder, optional): |
| A pre-initialized FrozenT5Embedder model. |
| If None, a new T5 model is created using the configuration in `params.t5_params`. |
| clip (FrozenCLIPEmbedder, optional): |
| A pre-initialized FrozenCLIPEmbedder model. |
| If None, a new CLIP model is created using the configuration in `params.clip_params`. |
| scheduler_steps (int, optional): The number of scheduler steps to use for inference. Default is 1000. |
| """ |
| super().__init__() |
| self.device = params.device |
| params.clip_params.device = self.device |
| params.t5_params.device = self.device |
|
|
| self.vae = AutoEncoder(params.vae_config).to(self.device).eval() if vae is None else vae |
| self.clip_encoder = ( |
| FrozenCLIPEmbedder( |
| version=params.clip_params.version, |
| max_length=params.clip_params.max_length, |
| always_return_pooled=params.clip_params.always_return_pooled, |
| device=params.clip_params.device, |
| ) |
| if clip is None |
| else clip |
| ) |
| self.t5_encoder = ( |
| FrozenT5Embedder( |
| params.t5_params.version, |
| max_length=params.t5_params.max_length, |
| device=params.t5_params.device, |
| load_config_only=params.t5_params.load_config_only, |
| ) |
| if t5 is None |
| else t5 |
| ) |
| self.transformer = Flux(params.flux_config).to(self.device).eval() if flux is None else flux |
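| # Scale factor relating pixel-space dimensions to latent dimensions, derived from the number of VAE channel-multiplier stages. |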
| self.vae_scale_factor = 2 ** (len(self.vae.params.ch_mult)) |
| self.scheduler = FlowMatchEulerDiscreteScheduler(num_train_timesteps=scheduler_steps) |
| self.params = params |
|
|
| def load_from_pretrained(self, ckpt_path, do_convert_from_hf=True, save_converted_model_to=None): |
| """ |
| Loads the model's weights from a checkpoint. If a Hugging Face checkpoint is provided, it is |
| converted to NeMo format and can optionally be saved to a local folder. |
| |
| Args: |
| ckpt_path (str): |
| Path to the checkpoint file. |
| do_convert_from_hf (bool, optional): |
| Whether to convert the checkpoint from Hugging Face format before loading. Default is True. |
| save_converted_model_to (str, optional): |
| Path to save the converted checkpoint if `do_convert_from_hf` is True. Default is None. |
| |
| Logs: |
| The function logs information about missing or unexpected keys during checkpoint loading. |
| """ |
| if do_convert_from_hf: |
| ckpt = flux_transformer_converter(ckpt_path, self.transformer.config) |
| if save_converted_model_to is not None: |
| save_path = os.path.join(save_converted_model_to, 'nemo_flux_transformer.safetensors') |
| save_safetensors(ckpt, save_path) |
| logging.info(f'saving converted transformer checkpoint to {save_path}') |
| else: |
| ckpt = load_safetensors(ckpt_path) |
| missing, unexpected = self.transformer.load_state_dict(ckpt, strict=False) |
| missing = [k for k in missing if not k.endswith('_extra_state')] |
| |
| if len(missing) > 0: |
| logging.info( |
| f"The following keys are missing during checkpoint loading, " |
| f"please check the ckpt provided or the image quality may be compromised.\n {missing}" |
| ) |
| logging.info(f"Found unexepected keys: \n {unexpected}") |
|
|
| def encoder_prompt( |
| self, |
| prompt: Union[str, List[str]], |
| num_images_per_prompt: int = 1, |
| prompt_embeds: Optional[torch.FloatTensor] = None, |
| pooled_prompt_embeds: Optional[torch.FloatTensor] = None, |
| max_sequence_length: int = 512, |
| device: Optional[torch.device] = 'cuda', |
| dtype: Optional[torch.dtype] = torch.float, |
| ): |
| """ |
| Encodes a text prompt (or a batch of prompts) into embeddings using both T5 and CLIP models. |
| |
| Args: |
| prompt (Union[str, List[str]]): |
| The text prompt(s) to be encoded. Can be a string or a list of strings. |
| num_images_per_prompt (int, optional): |
| The number of images to generate per prompt. Default is 1. |
| prompt_embeds (torch.FloatTensor, optional): |
| Precomputed prompt embeddings, if available. Default is None. |
| pooled_prompt_embeds (torch.FloatTensor, optional): |
| Precomputed pooled prompt embeddings, if available. Default is None. |
| max_sequence_length (int, optional): |
| The maximum sequence length for the text model. Default is 512. |
| device (torch.device, optional): |
| The device (CPU or CUDA) on which the models are placed. Default is 'cuda'. |
| dtype (torch.dtype, optional): |
| The data type for tensor operations. Default is `torch.float`. |
| |
| Returns: |
| Tuple[torch.FloatTensor, torch.FloatTensor, torch.FloatTensor]: |
| - The prompt embeddings. |
| - The pooled prompt embeddings. |
| - The text IDs for the prompt. |
| |
| Raises: |
| ValueError: If neither `prompt` nor `prompt_embeds` are provided. |
| """ |
| if prompt is not None: |
| batch_size = len(prompt) |
| elif prompt_embeds is not None: |
| batch_size = prompt_embeds.shape[0] |
| else: |
| raise ValueError("Either prompt or prompt_embeds must be provided.") |
| if device == 'cuda' and self.t5_encoder.device != device: |
| self.t5_encoder.to(device) |
| if prompt_embeds is None: |
| prompt_embeds = self.t5_encoder(prompt, max_sequence_length=max_sequence_length) |
| seq_len = prompt_embeds.shape[1] |
| prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) |
| prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1).to(dtype=dtype) |
|
|
| if device == 'cuda' and self.clip_encoder.device != device: |
| self.clip_encoder.to(device) |
| if pooled_prompt_embeds is None: |
| _, pooled_prompt_embeds = self.clip_encoder(prompt) |
|
|
| pooled_prompt_embeds = pooled_prompt_embeds.repeat(1, num_images_per_prompt, 1) |
| pooled_prompt_embeds = pooled_prompt_embeds.view(batch_size * num_images_per_prompt, -1).to(dtype=dtype) |
|
|
| dtype = dtype if dtype is not None else self.t5_encoder.dtype |
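| # Flux uses all-zero position ids for text tokens; only image tokens carry row/column ids. |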
| text_ids = torch.zeros(batch_size, prompt_embeds.shape[1], 3).to(device=device, dtype=dtype) |
| text_ids = text_ids.repeat(num_images_per_prompt, 1, 1) |
|
|
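| # The Megatron-Core based transformer expects sequence-first tensors, hence the (batch, seq, dim) -> (seq, batch, dim) transpose. |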
| return prompt_embeds.transpose(0, 1), pooled_prompt_embeds, text_ids |
|
|
| @staticmethod |
| def _prepare_latent_image_ids(batch_size: int, height: int, width: int, device: torch.device, dtype: torch.dtype): |
| """ |
| Prepares latent image IDs for input into the model. These IDs represent the image grid. |
| |
| Args: |
| batch_size (int): The number of samples in the batch. |
| height (int): The height of the image. |
| width (int): The width of the image. |
| device (torch.device): The device to place the tensor. |
| dtype (torch.dtype): The data type for the tensor. |
| |
| Returns: |
| torch.FloatTensor: A tensor representing the latent image IDs. |
| """ |
| latent_image_ids = torch.zeros(height // 2, width // 2, 3) |
| latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None] |
| latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :] |
|
|
| latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape |
|
|
| latent_image_ids = latent_image_ids[None, :].repeat(batch_size, 1, 1, 1) |
| latent_image_ids = latent_image_ids.reshape( |
| batch_size, latent_image_id_height * latent_image_id_width, latent_image_id_channels |
| ) |
|
|
| return latent_image_ids.to(device=device, dtype=dtype) |
|
|
| @staticmethod |
| def _pack_latents(latents, batch_size, num_channels_latents, height, width): |
| """ |
| Packs latents into desired shape, e.g. (B, C, H, W) --> (B, (H//2)*(W//2), C * 4). |
| |
| Args: |
| latents (torch.Tensor): The latents to be packed. |
| batch_size (int): The number of samples in the batch. |
| num_channels_latents (int): The number of channels in the latents. |
| height (int): The height of the image. |
| width (int): The width of the image. |
| |
| Returns: |
| torch.Tensor: The packed latents. |
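| |
| Example: |
| A latent of shape (1, 16, 64, 64) is packed into shape (1, 1024, 64). |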
| """ |
| latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2) |
| latents = latents.permute(0, 2, 4, 1, 3, 5) |
| latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4) |
|
|
| return latents |
|
|
| @staticmethod |
| def _unpack_latents(latents, height, width, vae_scale_factor): |
| """ |
| Unpacks the latents from the model output into an image format suitable for further processing. |
| |
| The method reshapes and permutes the latents, adjusting their dimensions according to the |
| specified `vae_scale_factor` to match the expected resolution of the image. |
| |
| Args: |
| latents (torch.Tensor): The latents output from the model, typically in a compact, compressed format. |
| height (int): The original height of the image before scaling, used to adjust the latent dimensions. |
| width (int): The original width of the image before scaling, used to adjust the latent dimensions. |
| vae_scale_factor (int): A scale factor used to adjust the resolution of the image when unpacking. |
| This factor is typically determined by the VAE's downsampling factor. |
| |
| Returns: |
| torch.Tensor: The unpacked latents reshaped to match the expected dimensions for image reconstruction. |
| The output tensor will have shape `(batch_size, channels, height * 2, width * 2)`. |
| |
| Notes: |
| - This function is intended to convert latents back into a format |
| that can be decoded into images by the VAE. |
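| - Example: with `height=512`, `width=512`, and `vae_scale_factor=16`, a packed latent of shape |
| (1, 1024, 64) is unpacked to (1, 16, 64, 64). |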
| """ |
| batch_size, num_patches, channels = latents.shape |
|
|
| height = height // vae_scale_factor |
| width = width // vae_scale_factor |
|
|
| latents = latents.view(batch_size, height, width, channels // 4, 2, 2) |
| latents = latents.permute(0, 3, 1, 4, 2, 5) |
|
|
| latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2) |
|
|
| return latents |
|
|
| @staticmethod |
| def _calculate_shift( |
| image_seq_len, |
| base_seq_len: int = 256, |
| max_seq_len: int = 4096, |
| base_shift: float = 0.5, |
| max_shift: float = 1.16, |
| ): |
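| """ |
| Calculates the resolution-dependent timestep shift `mu` for the flow-matching scheduler. |
| |
| The shift is a linear interpolation between `base_shift` and `max_shift`, |
| according to where `image_seq_len` falls between `base_seq_len` and `max_seq_len`. |
| """ |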
| |
| m = (max_shift - base_shift) / (max_seq_len - base_seq_len) |
| b = base_shift - m * base_seq_len |
| mu = image_seq_len * m + b |
| return mu |
|
|
| def prepare_latents( |
| self, |
| batch_size, |
| num_channels_latents, |
| height, |
| width, |
| dtype, |
| device, |
| generator, |
| latents=None, |
| ): |
| """ |
| Prepares and optionally generates image latents for use in the image generation pipeline. |
| |
| This method can either use the provided latents (if already available) or generate new random latents |
| using a random generator. The generated latents are then packed and prepared for the model to process. |
| |
| Args: |
| batch_size (int): The number of samples in the batch. |
| num_channels_latents (int): The number of channels in the latents (e.g., depth of the latent tensor). |
| height (int): The height of the image to be generated (before scaling). |
| width (int): The width of the image to be generated (before scaling). |
| dtype (torch.dtype): The data type to use for the latents (e.g., `torch.float32`). |
| device (torch.device): The device on which the latents will reside (e.g., 'cuda'). |
| generator (Union[torch.Generator, List[torch.Generator]]): A random number generator or a list of |
| generators for generating random latents. If a list is provided, its length must match the batch size. |
| latents (Optional[torch.FloatTensor]): An optional pre-existing latent tensor. If provided, it is used |
| instead of generating new latents. |
| |
| Returns: |
| tuple: A tuple containing: |
| - latents (torch.Tensor): |
| The prepared latents, with shape `(batch_size, num_channels_latents, height, width)`. |
| - latent_image_ids (torch.Tensor): |
| A tensor containing latent image IDs for each batch sample, used for indexing |
| in the model. |
| |
| Raises: |
| ValueError: If a list of generators is provided but its length does not match the batch size. |
| |
| """ |
| height = 2 * int(height) // self.vae_scale_factor |
| width = 2 * int(width) // self.vae_scale_factor |
|
|
| shape = (batch_size, num_channels_latents, height, width) |
|
|
| if latents is not None: |
| latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) |
| return latents.to(device=device, dtype=dtype), latent_image_ids |
|
|
| if isinstance(generator, list) and len(generator) != batch_size: |
| raise ValueError( |
| f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" |
| f" size of {batch_size}. Make sure the batch size matches the length of the generators." |
| ) |
|
|
| latents = FluxInferencePipeline._generate_rand_latents( |
| shape, generator=generator, device=device, dtype=dtype, batch_size=batch_size |
| ) |
| latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width) |
|
|
| latent_image_ids = self._prepare_latent_image_ids(batch_size, height, width, device, dtype) |
|
|
| return latents.transpose(0, 1), latent_image_ids |
|
|
| @staticmethod |
| def _generate_rand_latents( |
| shape, |
| generator, |
| device, |
| dtype, |
| batch_size, |
| ): |
| ''' |
| Create random latents using a random generator or a list of generators. |
| ''' |
| if isinstance(generator, list): |
| shape = (1,) + shape[1:] |
| latents = [ |
| torch.randn(shape, generator=generator[i], device=device, dtype=dtype) for i in range(batch_size) |
| ] |
| latents = torch.cat(latents, dim=0).to(device=device) |
| else: |
| latents = torch.randn(shape, generator=generator, device=device, dtype=dtype) |
|
|
| return latents |
|
|
| @staticmethod |
| def numpy_to_pil(images): |
| """ |
| Convert a numpy image or a batch of images to a PIL image. |
| """ |
| if images.ndim == 3: |
| images = images[None, ...] |
| images = (images * 255).round().astype("uint8") |
| pil_images = [Image.fromarray(image) for image in images] |
|
|
| return pil_images |
|
|
| @staticmethod |
| def torch_to_numpy(images): |
| ''' |
| Convert a torch image or a batch of images to a numpy image. |
| ''' |
| numpy_images = images.float().cpu().permute(0, 2, 3, 1).numpy() |
| return numpy_images |
|
|
| @staticmethod |
| def denormalize(image): |
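| """ |
| Denormalizes an image tensor from the range [-1, 1] to [0, 1]. |
| """ |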
| |
| return (image / 2 + 0.5).clamp(0, 1) |
|
|
| def __call__( |
| self, |
| prompt: Union[str, List[str]] = None, |
| height: Optional[int] = 512, |
| width: Optional[int] = 512, |
| num_inference_steps: int = 28, |
| timesteps: Optional[List[int]] = None, |
| guidance_scale: float = 7.0, |
| num_images_per_prompt: Optional[int] = 1, |
| generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, |
| latents: Optional[torch.FloatTensor] = None, |
| prompt_embeds: Optional[torch.FloatTensor] = None, |
| pooled_prompt_embeds: Optional[torch.FloatTensor] = None, |
| output_type: Optional[str] = "pil", |
| max_sequence_length: int = 512, |
| device: torch.device = 'cuda', |
| dtype: torch.dtype = torch.float32, |
| save_to_disk: bool = True, |
| offload: bool = False, |
| output_path: str = None, |
| ): |
| """ |
| Generates images based on a given text prompt and various model parameters. |
| Optionally saves the images to disk. |
| |
| This method orchestrates the process of generating images by embedding the prompt, preparing the latent |
| vectors, iterating through timesteps in the diffusion process, and then decoding the latent representation |
| back into an image. It supports both the generation of latent representations or final images in a desired |
| output format (e.g., PIL image). The images are optionally saved to disk with a unique filename based |
| on the prompt. |
| |
| Args: |
| prompt (Union[str, List[str]]): |
| A text prompt or a list of text prompts to guide image generation. Each prompt |
| generates one or more images based on the `num_images_per_prompt`. |
| height (Optional[int]): |
| The height of the output image. Default is 512. |
| width (Optional[int]): |
| The width of the output image. Default is 512. |
| num_inference_steps (int): |
| The number of steps for the diffusion process. Default is 28. |
| timesteps (Optional[List[int]]): |
| A list of specific timesteps for the diffusion process. If not provided, |
| they are automatically calculated. |
| guidance_scale (float): |
| The scale of the guidance signal, typically used to control the strength of prompt conditioning. |
| num_images_per_prompt (Optional[int]): |
| The number of images to generate per prompt. Default is 1. |
| generator (Optional[Union[torch.Generator, List[torch.Generator]]]): |
| A random number generator or a list of generators |
| for generating latents. If a list is provided, it should match the batch size. |
| latents (Optional[torch.FloatTensor]): |
| Pre-existing latents to use instead of generating new ones. |
| prompt_embeds (Optional[torch.FloatTensor]): |
| Optionally pre-computed prompt embeddings to skip the prompt encoding step. |
| pooled_prompt_embeds (Optional[torch.FloatTensor]): |
| Optionally pre-computed pooled prompt embeddings. |
| output_type (Optional[str]): |
| The format of the output. Can be "latent" or "pil" (PIL image). Default is "pil". |
| max_sequence_length (int): |
| The maximum sequence length for tokenizing the prompt. Default is 512. |
| device (torch.device): |
| The device on which the computation should take place (e.g., 'cuda'). Default is 'cuda'. |
| dtype (torch.dtype): |
| The data type of the latents and model weights. Default is `torch.float32`. |
| save_to_disk (bool): |
| Whether or not to save the generated images to disk. Default is True. |
| offload (bool): |
| Whether or not to offload model components to CPU to free up GPU memory during the process. |
| Default is False. |
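| output_path (str): |
| Directory in which generated images are saved when `save_to_disk` is True. |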
| |
| Returns: |
| Union[List[Image.Image], torch.Tensor]: |
| The generated images or latents, depending on the `output_type` argument. |
| If `output_type` is "pil", a list of PIL images is returned. If "latent", the latents are returned. |
| |
| Raises: |
| ValueError: If neither a `prompt` nor `prompt_embeds` is provided. |
| |
| Notes: |
| - The model expects a device of 'cuda'. |
| The method will raise an assertion error if a different device is provided. |
| - The method handles both prompt-based and pre-embedded prompt input, |
| providing flexibility for different usage scenarios. |
| - If `save_to_disk` is enabled, images will be saved with a filename derived from the prompt text. |
| """ |
| assert device == 'cuda', 'Transformer blocks in Mcore must run on cuda devices' |
|
|
| if prompt is not None and isinstance(prompt, str): |
| batch_size = 1 |
| prompt = [prompt] |
| elif prompt is not None and isinstance(prompt, list): |
| batch_size = len(prompt) |
| elif prompt_embeds is not None and isinstance(prompt_embeds, torch.FloatTensor): |
| batch_size = prompt_embeds.shape[0] |
| else: |
| raise ValueError("Either prompt or prompt_embeds must be provided.") |
|
|
| |
| prompt_embeds, pooled_prompt_embeds, text_ids = self.encoder_prompt( |
| prompt=prompt, |
| prompt_embeds=prompt_embeds, |
| pooled_prompt_embeds=pooled_prompt_embeds, |
| num_images_per_prompt=num_images_per_prompt, |
| max_sequence_length=max_sequence_length, |
| device=device, |
| dtype=dtype, |
| ) |
| if offload: |
| self.t5_encoder.to('cpu') |
| self.clip_encoder.to('cpu') |
| torch.cuda.empty_cache() |
|
|
| |
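| # The transformer operates on 2x2-packed patches, so the unpacked latent has in_channels // 4 channels. |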
| num_channels_latents = self.transformer.in_channels // 4 |
| latents, latent_image_ids = self.prepare_latents( |
| batch_size * num_images_per_prompt, num_channels_latents, height, width, dtype, device, generator, latents |
| ) |
| |
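| # Flow-matching sigma schedule: linearly spaced from 1.0 down to 1/num_inference_steps, then time-shifted by the resolution-dependent mu below. |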
| sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) |
| image_seq_len = latents.shape[0] |
|
|
| mu = FluxInferencePipeline._calculate_shift( |
| image_seq_len, |
| self.scheduler.base_image_seq_len, |
| self.scheduler.max_image_seq_len, |
| self.scheduler.base_shift, |
| self.scheduler.max_shift, |
| ) |
|
|
| self.scheduler.set_timesteps(sigmas=sigmas, device=device, mu=mu) |
| timesteps = self.scheduler.timesteps |
|
|
| if device == 'cuda' and device != self.device: |
| self.transformer.to(device) |
| with torch.no_grad(): |
| for i, t in tqdm(enumerate(timesteps)): |
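| # Latents are sequence-first (seq_len, batch, channels), so shape[1] is the batch size used to broadcast the timestep and guidance. |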
| timestep = t.expand(latents.shape[1]).to(device=latents.device, dtype=latents.dtype) |
| if self.transformer.guidance_embed: |
| guidance = torch.tensor([guidance_scale], device=device).expand(latents.shape[1]) |
| else: |
| guidance = None |
| with torch.autocast(device_type='cuda', dtype=latents.dtype): |
| pred = self.transformer( |
| img=latents, |
| txt=prompt_embeds, |
| y=pooled_prompt_embeds, |
| timesteps=timestep / 1000, |
| img_ids=latent_image_ids, |
| txt_ids=text_ids, |
| guidance=guidance, |
| ) |
| latents = self.scheduler.step(pred, t, latents)[0] |
| if offload: |
| self.transformer.to('cpu') |
| torch.cuda.empty_cache() |
|
|
| if output_type == "latent": |
| return latents.transpose(0, 1) |
| elif output_type == "pil": |
| latents = self._unpack_latents(latents.transpose(0, 1), height, width, self.vae_scale_factor) |
| if device == 'cuda' and device != self.device: |
| self.vae.to(device) |
| with torch.autocast(device_type='cuda', dtype=latents.dtype): |
| image = self.vae.decode(latents) |
| if offload: |
| self.vae.to('cpu') |
| torch.cuda.empty_cache() |
| image = FluxInferencePipeline.denormalize(image) |
| image = FluxInferencePipeline.torch_to_numpy(image) |
| image = FluxInferencePipeline.numpy_to_pil(image) |
| if save_to_disk: |
| logging.info('Saving images to disk') |
| os.makedirs(output_path, exist_ok=True) |
| assert len(image) == int(len(prompt) * num_images_per_prompt) |
| # Avoid shadowing `image` (the list of PIL images) so the full list is returned below. |
| file_names = [p[:40] + f'_{idx}' for p in prompt for idx in range(num_images_per_prompt)] |
| for file_name, img in zip(file_names, image): |
| img.save(os.path.join(output_path, f'{file_name}.png')) |
|
|
| return image |
|
|
|
|
| class FluxControlNetInferencePipeline(FluxInferencePipeline): |
| ''' |
| Flux ControlNet inference pipeline; initializes a ControlNet component in addition to the standard Flux pipeline components. |
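| |
| Example (illustrative only; the config objects and control image depend on your setup): |
| pipeline = FluxControlNetInferencePipeline(params, controlnet_config) |
| images = pipeline( |
| prompt=["A robot holding a sign that says hello"], |
| control_image=control_image, |
| controlnet_conditioning_scale=0.8, |
| output_path='./flux_controlnet_outputs', |
| ) |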
| ''' |
|
|
| def __init__( |
| self, |
| params: Optional[FluxModelParams] = None, |
| controlnet_config: Optional[FluxControlNetConfig] = None, |
| flux: Flux = None, |
| vae: AutoEncoder = None, |
| t5: FrozenT5Embedder = None, |
| clip: FrozenCLIPEmbedder = None, |
| scheduler_steps: int = 1000, |
| flux_controlnet: FluxControlNet = None, |
| ): |
| ''' |
| Same as FluxInferencePipeline, with an additional ControlNet component. |
| ''' |
| super().__init__( |
| params, |
| flux, |
| vae, |
| t5, |
| clip, |
| scheduler_steps, |
| ) |
| self.flux_controlnet = FluxControlNet(controlnet_config) if flux_controlnet is None else flux_controlnet |
|
|
| def load_from_pretrained( |
| self, flux_ckpt_path, controlnet_ckpt_path, do_convert_from_hf=True, save_converted_model_to=None |
| ): |
| ''' |
| Loads both the Flux base model and the Flux ControlNet checkpoints, converting them to NeMo format when `do_convert_from_hf` is True. |
| ''' |
| if do_convert_from_hf: |
| flux_ckpt = flux_transformer_converter(flux_ckpt_path, self.transformer.config) |
| flux_controlnet_ckpt = flux_transformer_converter(controlnet_ckpt_path, self.flux_controlnet.config) |
|
|
| if save_converted_model_to is not None: |
| save_path = os.path.join(save_converted_model_to, 'nemo_flux_transformer.safetensors') |
| save_safetensors(flux_ckpt, save_path) |
| logging.info(f'saving converted transformer checkpoint to {save_path}') |
| save_path = os.path.join(save_converted_model_to, 'nemo_flux_controlnet_transformer.safetensors') |
| save_safetensors(flux_controlnet_ckpt, save_path) |
| logging.info(f'saving converted transformer checkpoint to {save_path}') |
| else: |
| flux_ckpt = load_safetensors(flux_ckpt_path) |
| flux_controlnet_ckpt = load_safetensors(controlnet_ckpt_path) |
| missing, unexpected = self.transformer.load_state_dict(flux_ckpt, strict=False) |
| missing = [k for k in missing if not k.endswith('_extra_state')] |
| |
| if len(missing) > 0: |
| logging.info( |
| f"The following keys are missing during flux checkpoint loading, " |
| f"please check the ckpt provided or the image quality may be compromised.\n {missing}" |
| ) |
| logging.info(f"Found unexepected keys: \n {unexpected}") |
|
|
| missing, unexpected = self.flux_controlnet.load_state_dict(flux_controlnet_ckpt, strict=False) |
| missing = [k for k in missing if not k.endswith('_extra_state')] |
| |
| if len(missing) > 0: |
| logging.info( |
| f"The following keys are missing during controlnet checkpoint loading, " |
| f"please check the ckpt provided or the image quality may be compromised.\n {missing}" |
| ) |
| logging.info(f"Found unexepected keys: \n {unexpected}") |
|
|
| def pil_to_numpy(self, images): |
| ''' |
| PIL image to numpy array |
| ''' |
| if not isinstance(images, list): |
| images = [images] |
| images = [np.array(image).astype(np.float32) / 255.0 for image in images] |
| images = np.stack(images, axis=0) |
|
|
| return images |
|
|
| def numpy_to_pt(self, images: np.ndarray) -> torch.Tensor: |
| ''' |
| Convert numpy image into torch tensors |
| ''' |
| if images.ndim == 3: |
| images = images[..., None] |
|
|
| images = torch.from_numpy(images.transpose(0, 3, 1, 2)) |
| return images |
|
|
| def prepare_image( |
| self, |
| images, |
| height, |
| width, |
| batch_size, |
| num_images_per_prompt, |
| device, |
| dtype, |
| ): |
| ''' |
| Preprocess image into torch tensor, also duplicate by batch size. |
| ''' |
| if isinstance(images, torch.Tensor): |
| pass |
| else: |
| orig_height, orig_width = images[0].height, images[0].width |
| if height != orig_height or width != orig_width: |
| images = [image.resize((width, height), resample=3) for image in images] |
|
|
| images = self.pil_to_numpy(images) |
| images = self.numpy_to_pt(images) |
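| # A single control image is tiled to the full batch; otherwise each image is repeated num_images_per_prompt times. |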
| image_batch_size = images.shape[0] |
| if image_batch_size == 1: |
| repeat_by = batch_size |
| else: |
| repeat_by = num_images_per_prompt |
| images = images.repeat_interleave(repeat_by, dim=0) |
|
|
| images = images.to(device=device, dtype=dtype) |
|
|
| return images |
|
|
| def __call__( |
| self, |
| prompt: Union[str, List[str]] = None, |
| height: Optional[int] = 512, |
| width: Optional[int] = 512, |
| num_inference_steps: int = 28, |
| timesteps: Optional[List[int]] = None, |
| guidance_scale: float = 7.0, |
| num_images_per_prompt: Optional[int] = 1, |
| generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, |
| latents: Optional[torch.FloatTensor] = None, |
| prompt_embeds: Optional[torch.FloatTensor] = None, |
| pooled_prompt_embeds: Optional[torch.FloatTensor] = None, |
| output_type: Optional[str] = "pil", |
| max_sequence_length: int = 512, |
| device: torch.device = 'cuda', |
| dtype: torch.dtype = torch.float32, |
| save_to_disk: bool = True, |
| offload: bool = False, |
| control_guidance_start: float = 0.0, |
| control_guidance_end: float = 1.0, |
| control_image: Union[Image.Image, torch.FloatTensor] = None, |
| controlnet_conditioning_scale: Union[float, List[float]] = 1.0, |
| output_path: str = None, |
| ): |
| """ |
| Generates images based on a given text prompt and optionally incorporates control images and ControlNet for |
| guidance. |
| |
| This method generates images by embedding the prompt, preparing the latent vectors, iterating through timesteps |
| in the diffusion process, and then decoding the latent representation back into an image. The method supports |
| control images through ControlNet, where the `control_image` is used to condition the image generation. |
| It also allows you to specify custom guidance scales and other parameters. Generated images can be saved |
| to disk if requested. |
| |
| Args: |
| prompt (Union[str, List[str]]): |
| A text prompt or a list of text prompts to guide image generation. Each prompt generates one or more |
| images based on the `num_images_per_prompt`. |
| height (Optional[int]): |
| The height of the output image. Default is 512. |
| width (Optional[int]): |
| The width of the output image. Default is 512. |
| num_inference_steps (int): |
| The number of steps for the diffusion process. Default is 28. |
| timesteps (Optional[List[int]]): |
| A list of specific timesteps for the diffusion process. If not provided, they are automatically |
| calculated. |
| guidance_scale (float): |
| The scale of the guidance signal, typically used to control the strength of prompt conditioning. |
| num_images_per_prompt (Optional[int]): |
| The number of images to generate per prompt. Default is 1. |
| generator (Optional[Union[torch.Generator, List[torch.Generator]]]): |
| A random number generator or a list of generators for generating latents. If a list is provided, |
| it should match the batch size. |
| latents (Optional[torch.FloatTensor]): |
| Pre-existing latents to use instead of generating new ones. |
| prompt_embeds (Optional[torch.FloatTensor]): |
| Optionally pre-computed prompt embeddings to skip the prompt encoding step. |
| pooled_prompt_embeds (Optional[torch.FloatTensor]): |
| Optionally pre-computed pooled prompt embeddings. |
| output_type (Optional[str]): |
| The format of the output. Can be "latent" or "pil" (PIL image). Default is "pil". |
| max_sequence_length (int): |
| The maximum sequence length for tokenizing the prompt. Default is 512. |
| device (torch.device): |
| The device on which the computation should take place (e.g., 'cuda'). Default is 'cuda'. |
| dtype (torch.dtype): |
| The data type of the latents and model weights. Default is `torch.float32`. |
| save_to_disk (bool): |
| Whether or not to save the generated images to disk. Default is True. |
| offload (bool): |
| Whether or not to offload model components to CPU to free up GPU memory during the process. |
| Default is False. |
| control_guidance_start (float): |
| The start point for control guidance to apply during the diffusion process. |
| control_guidance_end (float): |
| The end point for control guidance to apply during the diffusion process. |
| control_image (Union[Image.Image, torch.FloatTensor]): |
| The image used for conditioning the generation process via ControlNet. |
| controlnet_conditioning_scale (Union[float, List[float]]): |
| Scaling factors to control the impact of the control image in the generation process. |
| Can be a single value or a list for multiple images. |
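| output_path (str): |
| Directory in which generated images are saved when `save_to_disk` is True. |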
| |
| Returns: |
| Union[List[Image.Image], torch.Tensor]: |
| The generated images or latents, depending on the `output_type` argument. |
| If `output_type` is "pil", a list of PIL images is returned. If "latent", the latents are returned. |
| |
| Raises: |
| ValueError: If neither a `prompt` nor `prompt_embeds` is provided. |
| |
| Notes: |
| - The model expects a device of 'cuda'. |
| The method will raise an assertion error if a different device is provided. |
| - The method supports conditional image generation using ControlNet, where a `control_image` can guide the |
| generation process. |
| - If `save_to_disk` is enabled, images will be saved with a filename derived from the prompt text. |
| """ |
| assert device == 'cuda', 'Transformer blocks in Mcore must run on cuda devices' |
|
|
| if prompt is not None and isinstance(prompt, str): |
| batch_size = 1 |
| prompt = [prompt] |
| elif prompt is not None and isinstance(prompt, list): |
| batch_size = len(prompt) |
| elif prompt_embeds is not None and isinstance(prompt_embeds, torch.FloatTensor): |
| batch_size = prompt_embeds.shape[0] |
| else: |
| raise ValueError("Either prompt or prompt_embeds must be provided.") |
|
|
| |
| prompt_embeds, pooled_prompt_embeds, text_ids = self.encoder_prompt( |
| prompt=prompt, |
| prompt_embeds=prompt_embeds, |
| pooled_prompt_embeds=pooled_prompt_embeds, |
| num_images_per_prompt=num_images_per_prompt, |
| max_sequence_length=max_sequence_length, |
| device=device, |
| dtype=dtype, |
| ) |
| if offload: |
| self.t5_encoder.to('cpu') |
| self.clip_encoder.to('cpu') |
| torch.cuda.empty_cache() |
|
|
| |
| num_channels_latents = self.transformer.in_channels // 4 |
| latents, latent_image_ids = self.prepare_latents( |
| batch_size * num_images_per_prompt, num_channels_latents, height, width, dtype, device, generator, latents |
| ) |
|
|
| |
| sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) |
| image_seq_len = latents.shape[0] |
|
|
| mu = FluxInferencePipeline._calculate_shift( |
| image_seq_len, |
| self.scheduler.base_image_seq_len, |
| self.scheduler.max_image_seq_len, |
| self.scheduler.base_shift, |
| self.scheduler.max_shift, |
| ) |
|
|
| self.scheduler.set_timesteps(sigmas=sigmas, device=device, mu=mu) |
| timesteps = self.scheduler.timesteps |
|
|
| control_image = self.prepare_image( |
| images=control_image, |
| height=height, |
| width=width, |
| batch_size=batch_size * num_images_per_prompt, |
| num_images_per_prompt=num_images_per_prompt, |
| device=device, |
| dtype=torch.float32, |
| ) |
|
|
| height, width = control_image.shape[-2:] |
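| # When the ControlNet has no input-hint block, the control image is VAE-encoded and packed the same way as the image latents. |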
| if self.flux_controlnet.input_hint_block is None: |
| if device == 'cuda' and self.device != device: |
| self.vae.to(device) |
| with torch.no_grad(): |
| control_image = self.vae.encode(control_image).to(dtype=dtype) |
|
|
| height_control_image, width_control_image = control_image.shape[2:] |
| control_image = self._pack_latents( |
| control_image, |
| batch_size * num_images_per_prompt, |
| num_channels_latents, |
| height_control_image, |
| width_control_image, |
| ).transpose(0, 1) |
|
|
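| # Zero out the ControlNet conditioning for steps whose normalized position falls outside [control_guidance_start, control_guidance_end]. |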
| controlnet_keep = [] |
| for i in range(len(timesteps)): |
| controlnet_keep.append( |
| 1.0 |
| - float(i / len(timesteps) < control_guidance_start or (i + 1) / len(timesteps) > control_guidance_end) |
| ) |
| if device == 'cuda' and device != self.device: |
| self.transformer.to(device) |
| self.flux_controlnet.to(device) |
| with torch.no_grad(): |
| for i, t in tqdm(enumerate(timesteps)): |
| timestep = t.expand(latents.shape[1]).to(device=latents.device, dtype=latents.dtype) |
| if self.transformer.guidance_embed: |
| guidance = torch.tensor([guidance_scale], device=device).expand(latents.shape[1]) |
| else: |
| guidance = None |
|
|
| conditioning_scale = controlnet_keep[i] * controlnet_conditioning_scale |
|
|
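| # The ControlNet returns residuals for the double- and single-stream blocks, which are injected into the base transformer below. |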
| with torch.autocast(device_type='cuda', dtype=latents.dtype): |
| controlnet_double_block_samples, controlnet_single_block_samples = self.flux_controlnet( |
| img=latents, |
| controlnet_cond=control_image, |
| txt=prompt_embeds, |
| y=pooled_prompt_embeds, |
| timesteps=timestep / 1000, |
| img_ids=latent_image_ids, |
| txt_ids=text_ids, |
| guidance=guidance, |
| conditioning_scale=conditioning_scale, |
| ) |
| pred = self.transformer( |
| img=latents, |
| txt=prompt_embeds, |
| y=pooled_prompt_embeds, |
| timesteps=timestep / 1000, |
| img_ids=latent_image_ids, |
| txt_ids=text_ids, |
| guidance=guidance, |
| controlnet_double_block_samples=controlnet_double_block_samples, |
| controlnet_single_block_samples=controlnet_single_block_samples, |
| ) |
| latents = self.scheduler.step(pred, t, latents)[0] |
| if offload: |
| self.transformer.to('cpu') |
| torch.cuda.empty_cache() |
|
|
| if output_type == "latent": |
| return latents.transpose(0, 1) |
| elif output_type == "pil": |
| latents = self._unpack_latents(latents.transpose(0, 1), height, width, self.vae_scale_factor) |
| if device == 'cuda' and device != self.device: |
| self.vae.to(device) |
| with torch.autocast(device_type='cuda', dtype=latents.dtype): |
| image = self.vae.decode(latents) |
| if offload: |
| self.vae.to('cpu') |
| torch.cuda.empty_cache() |
| image = FluxInferencePipeline.denormalize(image) |
| image = FluxInferencePipeline.torch_to_numpy(image) |
| image = FluxInferencePipeline.numpy_to_pil(image) |
| if save_to_disk: |
| logging.info('Saving images to disk') |
| os.makedirs(output_path, exist_ok=True) |
| assert len(image) == int(len(prompt) * num_images_per_prompt) |
| # Avoid shadowing `image` (the list of PIL images) so the full list is returned below. |
| file_names = [p[:40] + f'_{idx}' for p in prompt for idx in range(num_images_per_prompt)] |
| for file_name, img in zip(file_names, image): |
| img.save(os.path.join(output_path, f'{file_name}.png')) |
|
|
| return image |
|
|