Update src/pipeline.py

aa82b84 verified about 1 year ago

39 kB

	from diffusers import (
	DiffusionPipeline,
	AutoencoderKL,
	FluxPipeline,
	FluxTransformer2DModel
	)
	from diffusers.image_processor import VaeImageProcessor
	from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
	from huggingface_hub.constants import HF_HUB_CACHE
	from transformers import (
	T5EncoderModel,
	T5TokenizerFast,
	CLIPTokenizer,
	CLIPTextModel
	)
	import torch
	import torch._dynamo
	import gc
	from PIL import Image
	from pipelines.models import TextToImageRequest
	from torch import Generator
	import time
	import math
	from typing import Type, Dict, Any, Tuple, Callable, Optional, Union
	import numpy as np
	import torch.nn as nn
	import torch.nn.functional as F
	from torchao.quantization import quantize_, float8_weight_only, int8_dynamic_activation_int4_weight


	import inspect
	from typing import Any, Callable, Dict, List, Optional, Union
	import numpy as np
	import torch
	from transformers import (
	CLIPImageProcessor,
	CLIPTextModel,
	CLIPTokenizer,
	CLIPVisionModelWithProjection,
	T5EncoderModel,
	T5TokenizerFast,
	)
	from diffusers.image_processor import PipelineImageInput, VaeImageProcessor
	from diffusers.loaders import FluxIPAdapterMixin, FluxLoraLoaderMixin, FromSingleFileMixin, TextualInversionLoaderMixin
	from diffusers.models import AutoencoderKL, FluxTransformer2DModel
	from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
	from diffusers.utils import (
	USE_PEFT_BACKEND,
	is_torch_xla_available,
	logging,
	replace_example_docstring,
	scale_lora_layers,
	unscale_lora_layers,
	)
	from diffusers.utils.torch_utils import randn_tensor
	from diffusers.pipelines.pipeline_utils import DiffusionPipeline
	from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput

	import torch.utils.benchmark as benchmark
	# preconfigs
	# Process-wide runtime settings, applied as a side effect of importing this module.
	import os
	# NOTE(review): this is set after `import torch` (torch is imported near the top of the file);
	# presumably still honoured because no CUDA allocation has happened yet -- confirm.
	os.environ['PYTORCH_CUDA_ALLOC_CONF']="expandable_segments:True"
	os.environ["TOKENIZERS_PARALLELISM"] = "True"
	# NOTE(review): presumably makes TorchDynamo fall back instead of raising on compile errors -- confirm.
	torch._dynamo.config.suppress_errors = True
	# Allow TF32 for CUDA matmuls and keep cuDNN enabled.
	torch.backends.cuda.matmul.allow_tf32 = True
	torch.backends.cudnn.enabled = True
	# torch.backends.cudnn.benchmark = True

	# globals
	# `Pipeline` is a None placeholder that is only used as an annotation by load_pipeline() and infer().
	Pipeline = None
	# Checkpoint repository id and pinned revision passed to FluxPipeline.from_pretrained() in load_pipeline().
	ckpt_id = "manbeast3b/flux.1-schnell-full1"
	ckpt_revision = "cb1b599b0d712b9aab2c4df3ad27b050a27ec146"




	# Module logger, created through the `logging` helper imported from `diffusers.utils`.
	logger = logging.get_logger(__name__) # pylint: disable=invalid-name

	def calculate_shift(
	    image_seq_len,
	    base_seq_len: int = 256,
	    max_seq_len: int = 4096,
	    base_shift: float = 0.5,
	    max_shift: float = 1.16,
	):
	    """Linearly interpolate the shift value ``mu`` for a given image sequence length.

	    The line passes through ``(base_seq_len, base_shift)`` and
	    ``(max_seq_len, max_shift)``; values outside that range are extrapolated,
	    not clamped.

	    Args:
	        image_seq_len: Sequence length to evaluate the line at.
	        base_seq_len: Sequence length that maps to ``base_shift``.
	        max_seq_len: Sequence length that maps to ``max_shift``.
	        base_shift: Shift at ``base_seq_len``.
	        max_shift: Shift at ``max_seq_len``.

	    Returns:
	        The interpolated shift ``image_seq_len * slope + intercept``.
	    """
	    slope = (max_shift - base_shift) / (max_seq_len - base_seq_len)
	    intercept = base_shift - slope * base_seq_len
	    return image_seq_len * slope + intercept

	def retrieve_timesteps(
	    scheduler,
	    num_inference_steps: Optional[int] = None,
	    device: Optional[Union[str, torch.device]] = None,
	    timesteps: Optional[List[int]] = None,
	    sigmas: Optional[List[float]] = None,
	    **kwargs,
	):
	    """Configure ``scheduler`` and return its timestep schedule.

	    Exactly one of three modes is used: a custom ``timesteps`` list, a custom
	    ``sigmas`` list, or a plain ``num_inference_steps`` count.

	    Args:
	        scheduler: Object exposing ``set_timesteps(...)`` and a ``timesteps`` attribute.
	        num_inference_steps: Step count used when neither custom list is given.
	        device: Forwarded to ``scheduler.set_timesteps``.
	        timesteps: Custom timestep schedule (mutually exclusive with ``sigmas``).
	        sigmas: Custom sigma schedule (mutually exclusive with ``timesteps``).
	        **kwargs: Extra keyword arguments forwarded to ``scheduler.set_timesteps``.

	    Returns:
	        ``(scheduler.timesteps, num_inference_steps)``; for custom schedules the
	        step count is the length of the resulting schedule.

	    Raises:
	        ValueError: If both ``timesteps`` and ``sigmas`` are given, or the
	            scheduler's ``set_timesteps`` does not accept the one that was given.
	    """
	    if timesteps is not None and sigmas is not None:
	        raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")

	    # Plain mode: nothing custom was requested, the caller's step count is returned as-is.
	    if timesteps is None and sigmas is None:
	        scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
	        return scheduler.timesteps, num_inference_steps

	    # Custom mode: the scheduler must explicitly accept the provided keyword.
	    accepted = set(inspect.signature(scheduler.set_timesteps).parameters.keys())
	    if timesteps is not None:
	        if "timesteps" not in accepted:
	            raise ValueError(
	                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
	                f" timestep schedules. Please check whether you are using the correct scheduler."
	            )
	        scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
	    else:
	        if "sigmas" not in accepted:
	            raise ValueError(
	                f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
	                f" sigmas schedules. Please check whether you are using the correct scheduler."
	            )
	        scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)

	    schedule = scheduler.timesteps
	    return schedule, len(schedule)


	# Text-to-image Flux pipeline defined locally in this file.
	# NOTE: this definition rebinds the name `FluxPipeline` that is also imported from `diffusers`
	# at the top of the file; code below this point uses the local class.
	class FluxPipeline(
	DiffusionPipeline,
	FluxLoraLoaderMixin,
	FromSingleFileMixin,
	TextualInversionLoaderMixin,
	FluxIPAdapterMixin,
	):
	# Component order string; presumably consumed by the DiffusionPipeline CPU-offload helpers -- confirm.
	model_cpu_offload_seq = "text_encoder->text_encoder_2->image_encoder->transformer->vae"
	# Components whose __init__ parameters default to None.
	_optional_components = ["image_encoder", "feature_extractor"]
	# Names accepted in `callback_on_step_end_tensor_inputs`; enforced by check_inputs().
	_callback_tensor_inputs = ["latents", "prompt_embeds"]

	def __init__(
	    self,
	    scheduler: FlowMatchEulerDiscreteScheduler,
	    vae: AutoencoderKL,
	    text_encoder: CLIPTextModel,
	    tokenizer: CLIPTokenizer,
	    text_encoder_2: T5EncoderModel,
	    tokenizer_2: T5TokenizerFast,
	    transformer: FluxTransformer2DModel,
	    image_encoder: CLIPVisionModelWithProjection = None,
	    feature_extractor: CLIPImageProcessor = None,
	):
	    """Register the pipeline components and derive the size-related settings.

	    Args:
	        scheduler: Scheduler driving the denoising loop.
	        vae: Autoencoder used to decode latents into images.
	        text_encoder: CLIP text encoder (its pooled output is used).
	        tokenizer: Tokenizer paired with ``text_encoder``.
	        text_encoder_2: T5 encoder producing the sequence embeddings.
	        tokenizer_2: Tokenizer paired with ``text_encoder_2``.
	        transformer: Denoising transformer.
	        image_encoder: Optional image encoder for IP-Adapter inputs.
	        feature_extractor: Optional preprocessor paired with ``image_encoder``.
	    """
	    super().__init__()

	    components = dict(
	        vae=vae,
	        text_encoder=text_encoder,
	        text_encoder_2=text_encoder_2,
	        tokenizer=tokenizer,
	        tokenizer_2=tokenizer_2,
	        transformer=transformer,
	        scheduler=scheduler,
	        image_encoder=image_encoder,
	        feature_extractor=feature_extractor,
	    )
	    self.register_modules(**components)

	    # One factor of two per VAE down block after the first; fall back to 8 without a VAE.
	    registered_vae = getattr(self, "vae", None)
	    if registered_vae:
	        self.vae_scale_factor = 2 ** (len(registered_vae.config.block_out_channels) - 1)
	    else:
	        self.vae_scale_factor = 8

	    # Latents are packed into 2x2 patches, so image sizes must be divisible by
	    # the VAE scale factor times the patch size.
	    self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor * 2)

	    registered_tokenizer = getattr(self, "tokenizer", None)
	    if registered_tokenizer is None:
	        self.tokenizer_max_length = 77
	    else:
	        self.tokenizer_max_length = registered_tokenizer.model_max_length

	    self.default_sample_size = 128

	def _get_t5_prompt_embeds(
	    self,
	    prompt: Union[str, List[str]] = None,
	    num_images_per_prompt: int = 1,
	    max_sequence_length: int = 512,
	    device: Optional[torch.device] = None,
	    dtype: Optional[torch.dtype] = None,
	):
	    """Encode ``prompt`` with the T5 encoder (``text_encoder_2``).

	    Args:
	        prompt: A prompt string or a list of prompt strings.
	        num_images_per_prompt: Number of copies of each prompt's embeddings.
	        max_sequence_length: Token length the prompts are padded/truncated to.
	        device: Target device; defaults to ``self._execution_device``.
	        dtype: Accepted for interface compatibility only. As before, the
	            result is always cast to ``self.text_encoder_2.dtype``.

	    Returns:
	        Tensor of shape ``(batch_size * num_images_per_prompt, seq_len, hidden)``.
	    """
	    device = device or self._execution_device
	    # The previous `dtype = dtype or self.text_encoder.dtype` was dead (overwritten
	    # below) and raised when the CLIP encoder was None, so it is no longer evaluated.

	    prompt = [prompt] if isinstance(prompt, str) else prompt
	    batch_size = len(prompt)

	    if isinstance(self, TextualInversionLoaderMixin):
	        prompt = self.maybe_convert_prompt(prompt, self.tokenizer_2)

	    text_inputs = self.tokenizer_2(
	        prompt,
	        padding="max_length",
	        max_length=max_sequence_length,
	        truncation=True,
	        return_length=False,
	        return_overflowing_tokens=False,
	        return_tensors="pt",
	    )
	    text_input_ids = text_inputs.input_ids
	    untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids

	    if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
	        # Truncation happens at `max_sequence_length` (the T5 limit used above), not at the
	        # CLIP tokenizer's `tokenizer_max_length`, so the dropped tokens start there.
	        removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, max_sequence_length - 1 : -1])
	        logger.warning(
	            "The following part of your input was truncated because `max_sequence_length` is set to "
	            f" {max_sequence_length} tokens: {removed_text}"
	        )

	    prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
	    prompt_embeds = prompt_embeds.to(dtype=self.text_encoder_2.dtype, device=device)

	    _, seq_len, _ = prompt_embeds.shape

	    # Duplicate text embeddings for each generation per prompt, using an mps friendly method.
	    prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
	    prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)

	    return prompt_embeds

	def _get_clip_prompt_embeds(
	    self,
	    prompt: Union[str, List[str]],
	    num_images_per_prompt: int = 1,
	    device: Optional[torch.device] = None,
	):
	    """Return the pooled CLIP text embedding for ``prompt``.

	    Args:
	        prompt: A prompt string or a list of prompt strings.
	        num_images_per_prompt: Number of copies of each prompt's embedding.
	        device: Target device; defaults to ``self._execution_device``.

	    Returns:
	        Tensor of shape ``(batch_size * num_images_per_prompt, hidden)`` in the
	        dtype of ``self.text_encoder``.
	    """
	    device = device or self._execution_device

	    if isinstance(prompt, str):
	        prompt = [prompt]
	    num_prompts = len(prompt)

	    if isinstance(self, TextualInversionLoaderMixin):
	        prompt = self.maybe_convert_prompt(prompt, self.tokenizer)

	    token_ids = self.tokenizer(
	        prompt,
	        padding="max_length",
	        max_length=self.tokenizer_max_length,
	        truncation=True,
	        return_overflowing_tokens=False,
	        return_length=False,
	        return_tensors="pt",
	    ).input_ids

	    # Tokenize again without truncation only to report what was cut off.
	    full_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
	    was_truncated = full_ids.shape[-1] >= token_ids.shape[-1] and not torch.equal(token_ids, full_ids)
	    if was_truncated:
	        removed_text = self.tokenizer.batch_decode(full_ids[:, self.tokenizer_max_length - 1 : -1])
	        logger.warning(
	            "The following part of your input was truncated because CLIP can only handle sequences up to"
	            f" {self.tokenizer_max_length} tokens: {removed_text}"
	        )

	    # Only the pooled output of the CLIP text model is used.
	    encoder_output = self.text_encoder(token_ids.to(device), output_hidden_states=False)
	    pooled = encoder_output.pooler_output.to(dtype=self.text_encoder.dtype, device=device)

	    # Repeat per requested image, then fold the copies into the batch axis (mps friendly).
	    pooled = pooled.repeat(1, num_images_per_prompt)
	    return pooled.view(num_prompts * num_images_per_prompt, -1)

	def encode_prompt(
	    self,
	    prompt: Union[str, List[str]],
	    prompt_2: Union[str, List[str]],
	    device: Optional[torch.device] = None,
	    num_images_per_prompt: int = 1,
	    prompt_embeds: Optional[torch.FloatTensor] = None,
	    pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
	    max_sequence_length: int = 512,
	    lora_scale: Optional[float] = None,
	):
	    """Produce the text conditioning tensors for the transformer.

	    Args:
	        prompt: Prompt(s) for the CLIP encoder.
	        prompt_2: Prompt(s) for the T5 encoder; falls back to ``prompt`` when falsy.
	        device: Target device; defaults to ``self._execution_device``.
	        num_images_per_prompt: Number of images generated per prompt.
	        prompt_embeds: Precomputed T5 embeddings; when given, no encoding is done.
	        pooled_prompt_embeds: Precomputed pooled CLIP embeddings.
	        max_sequence_length: Token length used for the T5 prompt.
	        lora_scale: LoRA scale applied to the text encoders while encoding.

	    Returns:
	        ``(prompt_embeds, pooled_prompt_embeds, text_ids)`` where ``text_ids`` is
	        a zero tensor of shape ``(prompt_embeds.shape[1], 3)``.
	    """
	    device = device or self._execution_device
	    text_encoders = (self.text_encoder, self.text_encoder_2)

	    # Store the LoRA scale so the monkey patched LoRA function of the text
	    # encoders can access it, then scale the layers dynamically.
	    if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
	        self._lora_scale = lora_scale
	        for encoder in text_encoders:
	            if encoder is not None and USE_PEFT_BACKEND:
	                scale_lora_layers(encoder, lora_scale)

	    if isinstance(prompt, str):
	        prompt = [prompt]

	    if prompt_embeds is None:
	        prompt_2 = prompt_2 or prompt
	        if isinstance(prompt_2, str):
	            prompt_2 = [prompt_2]

	        # CLIP contributes only its pooled output; T5 provides the sequence embeddings.
	        pooled_prompt_embeds = self._get_clip_prompt_embeds(
	            prompt=prompt,
	            device=device,
	            num_images_per_prompt=num_images_per_prompt,
	        )
	        prompt_embeds = self._get_t5_prompt_embeds(
	            prompt=prompt_2,
	            num_images_per_prompt=num_images_per_prompt,
	            max_sequence_length=max_sequence_length,
	            device=device,
	        )

	    # Retrieve the original scale by scaling back the LoRA layers.
	    for encoder in text_encoders:
	        if encoder is not None and isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
	            unscale_lora_layers(encoder, lora_scale)

	    if self.text_encoder is not None:
	        ids_dtype = self.text_encoder.dtype
	    else:
	        ids_dtype = self.transformer.dtype
	    text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=ids_dtype)

	    return prompt_embeds, pooled_prompt_embeds, text_ids

	def encode_image(self, image, device, num_images_per_prompt):
	dtype = next(self.image_encoder.parameters()).dtype

	if not isinstance(image, torch.Tensor):
	image = self.feature_extractor(image, return_tensors="pt").pixel_values

	image = image.to(device=device, dtype=dtype)
	image_embeds = self.image_encoder(image).image_embeds
	image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0)
	return image_embeds

	def prepare_ip_adapter_image_embeds(
	    self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt
	):
	    """Build the per-adapter list of IP-Adapter image embeddings.

	    Args:
	        ip_adapter_image: One image or a list of images, one per IP adapter.
	            Only used when ``ip_adapter_image_embeds`` is None.
	        ip_adapter_image_embeds: Optional precomputed embeddings, one per adapter.
	        device: Device the resulting tensors are moved to.
	        num_images_per_prompt: Each embedding is concatenated this many times on dim 0.

	    Returns:
	        A list with one tensor per adapter.

	    Raises:
	        ValueError: If the number of images differs from the number of
	            image projection layers on the transformer.
	    """
	    if ip_adapter_image_embeds is not None:
	        per_adapter = list(ip_adapter_image_embeds)
	    else:
	        if not isinstance(ip_adapter_image, list):
	            ip_adapter_image = [ip_adapter_image]

	        if len(ip_adapter_image) != len(self.transformer.encoder_hid_proj.image_projection_layers):
	            raise ValueError(
	                f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.transformer.encoder_hid_proj.image_projection_layers)} IP Adapters."
	            )

	        # Encode each image once and add a leading axis for the later concatenation.
	        per_adapter = [self.encode_image(img, device, 1)[None, :] for img in ip_adapter_image]

	    return [
	        torch.cat([embeds] * num_images_per_prompt, dim=0).to(device=device)
	        for embeds in per_adapter
	    ]

	def check_inputs(
	self,
	prompt,
	prompt_2,
	height,
	width,
	negative_prompt=None,
	negative_prompt_2=None,
	prompt_embeds=None,
	negative_prompt_embeds=None,
	pooled_prompt_embeds=None,
	negative_pooled_prompt_embeds=None,
	callback_on_step_end_tensor_inputs=None,
	max_sequence_length=None,
	):
	if height % (self.vae_scale_factor * 2) != 0 or width % (self.vae_scale_factor * 2) != 0:
	logger.warning(
	f"`height` and `width` have to be divisible by {self.vae_scale_factor * 2} but are {height} and {width}. Dimensions will be resized accordingly"
	)

	if callback_on_step_end_tensor_inputs is not None and not all(
	k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
	):
	raise ValueError(
	f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
	)

	if prompt is not None and prompt_embeds is not None:
	raise ValueError(
	f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
	" only forward one of the two."
	)
	elif prompt_2 is not None and prompt_embeds is not None:
	raise ValueError(
	f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
	" only forward one of the two."
	)
	elif prompt is None and prompt_embeds is None:
	raise ValueError(
	"Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
	)
	elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
	raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
	elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
	raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")

	if negative_prompt is not None and negative_prompt_embeds is not None:
	raise ValueError(
	f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:"
	f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
	)
	elif negative_prompt_2 is not None and negative_prompt_embeds is not None:
	raise ValueError(
	f"Cannot forward both `negative_prompt_2`: {negative_prompt_2} and `negative_prompt_embeds`:"
	f" {negative_prompt_embeds}. Please make sure to only forward one of the two."
	)

	if prompt_embeds is not None and negative_prompt_embeds is not None:
	if prompt_embeds.shape != negative_prompt_embeds.shape:
	raise ValueError(
	"`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but"
	f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`"
	f" {negative_prompt_embeds.shape}."
	)

	if prompt_embeds is not None and pooled_prompt_embeds is None:
	raise ValueError(
	"If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
	)
	if negative_prompt_embeds is not None and negative_pooled_prompt_embeds is None:
	raise ValueError(
	"If `negative_prompt_embeds` are provided, `negative_pooled_prompt_embeds` also have to be passed. Make sure to generate `negative_pooled_prompt_embeds` from the same text encoder that was used to generate `negative_prompt_embeds`."
	)

	if max_sequence_length is not None and max_sequence_length > 512:
	raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")

	@staticmethod
	def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
	latent_image_ids = torch.zeros(height, width, 3)
	latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height)[:, None]
	latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width)[None, :]

	latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape

	latent_image_ids = latent_image_ids.reshape(
	latent_image_id_height * latent_image_id_width, latent_image_id_channels
	)

	return latent_image_ids.to(device=device, dtype=dtype)

	@staticmethod
	def _pack_latents(latents, batch_size, num_channels_latents, height, width):
	latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
	latents = latents.permute(0, 2, 4, 1, 3, 5)
	latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)

	return latents

	@staticmethod
	def _unpack_latents(latents, height, width, vae_scale_factor):
	batch_size, num_patches, channels = latents.shape

	# VAE applies 8x compression on images but we must also account for packing which requires
	# latent height and width to be divisible by 2.
	height = 2 * (int(height) // (vae_scale_factor * 2))
	width = 2 * (int(width) // (vae_scale_factor * 2))

	latents = latents.view(batch_size, height // 2, width // 2, channels // 4, 2, 2)
	latents = latents.permute(0, 3, 1, 4, 2, 5)

	latents = latents.reshape(batch_size, channels // (2 * 2), height, width)

	return latents

	def enable_vae_slicing(self):
	self.vae.enable_slicing()

	def disable_vae_slicing(self):
	self.vae.disable_slicing()

	def enable_vae_tiling(self):
	self.vae.enable_tiling()

	def disable_vae_tiling(self):
	self.vae.disable_tiling()

	def prepare_latents(
	self,
	batch_size,
	num_channels_latents,
	height,
	width,
	dtype,
	device,
	generator,
	latents=None,
	):
	# VAE applies 8x compression on images but we must also account for packing which requires
	# latent height and width to be divisible by 2.
	height = 2 * (int(height) // (self.vae_scale_factor * 2))
	width = 2 * (int(width) // (self.vae_scale_factor * 2))

	shape = (batch_size, num_channels_latents, height, width)

	if latents is not None:
	latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)
	return latents.to(device=device, dtype=dtype), latent_image_ids

	if isinstance(generator, list) and len(generator) != batch_size:
	raise ValueError(
	f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
	f" size of {batch_size}. Make sure the batch size matches the length of the generators."
	)

	latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
	latents = self._pack_latents(latents, batch_size, num_channels_latents, height, width)

	latent_image_ids = self._prepare_latent_image_ids(batch_size, height // 2, width // 2, device, dtype)

	return latents, latent_image_ids

	@property
	def guidance_scale(self):
	return self._guidance_scale

	@property
	def joint_attention_kwargs(self):
	return self._joint_attention_kwargs

	@property
	def num_timesteps(self):
	return self._num_timesteps

	@property
	def current_timestep(self):
	return self._current_timestep

	@property
	def interrupt(self):
	return self._interrupt

	@torch.no_grad()
	def __call__(
	self,
	prompt: Union[str, List[str]] = None,
	prompt_2: Optional[Union[str, List[str]]] = None,
	negative_prompt: Union[str, List[str]] = None,
	negative_prompt_2: Optional[Union[str, List[str]]] = None,
	true_cfg_scale: float = 1.0,
	height: Optional[int] = None,
	width: Optional[int] = None,
	num_inference_steps: int = 28,
	sigmas: Optional[List[float]] = None,
	guidance_scale: float = 3.5,
	num_images_per_prompt: Optional[int] = 1,
	generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
	latents: Optional[torch.FloatTensor] = None,
	prompt_embeds: Optional[torch.FloatTensor] = None,
	pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
	ip_adapter_image: Optional[PipelineImageInput] = None,
	ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
	negative_ip_adapter_image: Optional[PipelineImageInput] = None,
	negative_ip_adapter_image_embeds: Optional[List[torch.Tensor]] = None,
	negative_prompt_embeds: Optional[torch.FloatTensor] = None,
	negative_pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
	output_type: Optional[str] = "pil",
	return_dict: bool = True,
	joint_attention_kwargs: Optional[Dict[str, Any]] = None,
	callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
	callback_on_step_end_tensor_inputs: List[str] = ["latents"],
	max_sequence_length: int = 512,
	):
	"""Generate images from text: encode prompts, denoise latents, decode with the VAE.

	Behaviour established by this method's body:
	- `height` / `width` default to `default_sample_size * vae_scale_factor`.
	- The negative prompt is only encoded, and the second transformer pass only run,
	when `true_cfg_scale > 1` and a negative prompt (or both negative embeddings) is given.
	- `guidance_scale` is stored on the instance but the transformer is always called
	with `guidance=None` (the guidance branch is disabled with `if False:`).
	- The latent channel count is hard-coded to 16.
	- When `sigmas` is None, `np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)` is used.
	- With `output_type == "latent"` the packed latents are returned without VAE decoding.

	Returns:
	`FluxPipelineOutput(images=...)`, or the tuple `(image,)` when `return_dict` is False.

	NOTE(review): `callback_on_step_end_tensor_inputs` has a mutable list default; it is
	only read here, never mutated.
	NOTE(review): `callback_on_step_end` is called as `callback(self, i, t, callback_kwargs)`
	and its return value is used as a dict, which does not match the `Callable[..., None]`
	annotation -- confirm the intended contract.
	"""
	height = height or self.default_sample_size * self.vae_scale_factor
	width = width or self.default_sample_size * self.vae_scale_factor

	# 1. Check inputs. Raise error if not correct
	self.check_inputs(
	prompt,
	prompt_2,
	height,
	width,
	negative_prompt=negative_prompt,
	negative_prompt_2=negative_prompt_2,
	prompt_embeds=prompt_embeds,
	negative_prompt_embeds=negative_prompt_embeds,
	pooled_prompt_embeds=pooled_prompt_embeds,
	negative_pooled_prompt_embeds=negative_pooled_prompt_embeds,
	callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
	max_sequence_length=max_sequence_length,
	)

	# Per-call state exposed through the read-only properties of this class.
	self._guidance_scale = guidance_scale
	self._joint_attention_kwargs = joint_attention_kwargs
	self._current_timestep = None
	self._interrupt = False

	# 2. Define call parameters
	if prompt is not None and isinstance(prompt, str):
	batch_size = 1
	elif prompt is not None and isinstance(prompt, list):
	batch_size = len(prompt)
	else:
	batch_size = prompt_embeds.shape[0]

	device = self._execution_device

	# 3. Encode the prompts; the LoRA scale is taken from joint_attention_kwargs["scale"] if present.
	lora_scale = (
	self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
	)
	has_neg_prompt = negative_prompt is not None or (
	negative_prompt_embeds is not None and negative_pooled_prompt_embeds is not None
	)
	do_true_cfg = true_cfg_scale > 1 and has_neg_prompt
	(
	prompt_embeds,
	pooled_prompt_embeds,
	text_ids,
	) = self.encode_prompt(
	prompt=prompt,
	prompt_2=prompt_2,
	prompt_embeds=prompt_embeds,
	pooled_prompt_embeds=pooled_prompt_embeds,
	device=device,
	num_images_per_prompt=num_images_per_prompt,
	max_sequence_length=max_sequence_length,
	lora_scale=lora_scale,
	)
	if do_true_cfg:
	(
	negative_prompt_embeds,
	negative_pooled_prompt_embeds,
	_,
	) = self.encode_prompt(
	prompt=negative_prompt,
	prompt_2=negative_prompt_2,
	prompt_embeds=negative_prompt_embeds,
	pooled_prompt_embeds=negative_pooled_prompt_embeds,
	device=device,
	num_images_per_prompt=num_images_per_prompt,
	max_sequence_length=max_sequence_length,
	lora_scale=lora_scale,
	)

	# 4. Prepare latent variables
	# The channel count is fixed instead of being read from the transformer config.
	num_channels_latents = 16 #self.transformer.config.in_channels // 4
	latents, latent_image_ids = self.prepare_latents(
	batch_size * num_images_per_prompt,
	num_channels_latents,
	height,
	width,
	prompt_embeds.dtype,
	device,
	generator,
	latents,
	)

	# 5. Prepare timesteps
	sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps) if sigmas is None else sigmas
	image_seq_len = latents.shape[1]
	mu = calculate_shift(
	image_seq_len,
	self.scheduler.config.get("base_image_seq_len", 256),
	self.scheduler.config.get("max_image_seq_len", 4096),
	self.scheduler.config.get("base_shift", 0.5),
	self.scheduler.config.get("max_shift", 1.16),
	)
	timesteps, num_inference_steps = retrieve_timesteps(
	self.scheduler,
	num_inference_steps,
	device,
	sigmas=sigmas,
	mu=mu,
	)
	num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
	self._num_timesteps = len(timesteps)

	# handle guidance
	# The guidance-embedding branch is switched off, so `guidance` is always None here.
	if False: #self.transformer.config.guidance_embeds:
	guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
	guidance = guidance.expand(latents.shape[0])
	else:
	guidance = None

	# If only one side (positive / negative) of the IP-Adapter input is given,
	# the missing side is filled with an all-zero uint8 image.
	# NOTE(review): the placeholder is allocated as (width, height, 3) -- confirm the axis order.
	if (ip_adapter_image is not None or ip_adapter_image_embeds is not None) and (
	negative_ip_adapter_image is None and negative_ip_adapter_image_embeds is None
	):
	negative_ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)
	elif (ip_adapter_image is None and ip_adapter_image_embeds is None) and (
	negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None
	):
	ip_adapter_image = np.zeros((width, height, 3), dtype=np.uint8)

	if self.joint_attention_kwargs is None:
	self._joint_attention_kwargs = {}

	image_embeds = None
	negative_image_embeds = None
	if ip_adapter_image is not None or ip_adapter_image_embeds is not None:
	image_embeds = self.prepare_ip_adapter_image_embeds(
	ip_adapter_image,
	ip_adapter_image_embeds,
	device,
	batch_size * num_images_per_prompt,
	)
	if negative_ip_adapter_image is not None or negative_ip_adapter_image_embeds is not None:
	negative_image_embeds = self.prepare_ip_adapter_image_embeds(
	negative_ip_adapter_image,
	negative_ip_adapter_image_embeds,
	device,
	batch_size * num_images_per_prompt,
	)

	# 6. Denoising loop
	with self.progress_bar(total=num_inference_steps) as progress_bar:
	for i, t in enumerate(timesteps):
	# Once the interrupt flag is set, every remaining step is skipped.
	if self.interrupt:
	continue

	self._current_timestep = t
	if image_embeds is not None:
	self._joint_attention_kwargs["ip_adapter_image_embeds"] = image_embeds
	# broadcast to batch dimension in a way that's compatible with ONNX/Core ML
	timestep = t.expand(latents.shape[0]).to(latents.dtype)
	# The transformer receives the timestep divided by 1000.
	noise_pred = self.transformer(
	hidden_states=latents,
	timestep=timestep / 1000,
	guidance=guidance,
	pooled_projections=pooled_prompt_embeds,
	encoder_hidden_states=prompt_embeds,
	txt_ids=text_ids,
	img_ids=latent_image_ids,
	joint_attention_kwargs=self.joint_attention_kwargs,
	return_dict=False,
	)[0]

	# True CFG: second pass with the negative conditioning, then move the
	# prediction away from the negative one by `true_cfg_scale`.
	if do_true_cfg:
	if negative_image_embeds is not None:
	self._joint_attention_kwargs["ip_adapter_image_embeds"] = negative_image_embeds
	neg_noise_pred = self.transformer(
	hidden_states=latents,
	timestep=timestep / 1000,
	guidance=guidance,
	pooled_projections=negative_pooled_prompt_embeds,
	encoder_hidden_states=negative_prompt_embeds,
	txt_ids=text_ids,
	img_ids=latent_image_ids,
	joint_attention_kwargs=self.joint_attention_kwargs,
	return_dict=False,
	)[0]
	noise_pred = neg_noise_pred + true_cfg_scale * (noise_pred - neg_noise_pred)

	# compute the previous noisy sample x_t -> x_t-1
	latents_dtype = latents.dtype
	latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]

	# The dtype is only restored when an MPS backend is available.
	if latents.dtype != latents_dtype:
	if torch.backends.mps.is_available():
	# some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
	latents = latents.to(latents_dtype)

	if callback_on_step_end is not None:
	callback_kwargs = {}
	# Tensors are looked up by name among this function's local variables,
	# so the requested names must match local variable names exactly.
	for k in callback_on_step_end_tensor_inputs:
	callback_kwargs[k] = locals()[k]
	callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)

	# The callback may replace these tensors through the dict it returns.
	latents = callback_outputs.pop("latents", latents)
	prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)

	# Advance the progress bar on the last step and on every completed scheduler-order step after warmup.
	if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
	progress_bar.update()


	self._current_timestep = None

	# 7. Return packed latents as-is, or unpack, un-scale and decode them with the VAE.
	if output_type == "latent":
	image = latents
	else:
	latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
	latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
	image = self.vae.decode(latents, return_dict=False)[0]
	image = self.image_processor.postprocess(image, output_type=output_type)

	# Offload all models
	self.maybe_free_model_hooks()

	if not return_dict:
	return (image,)

	return FluxPipelineOutput(images=image)


	def f(model, **kwargs):
	    """Call ``model(**kwargs)`` with gradient tracking disabled and return its result."""
	    with torch.no_grad():
	        result = model(**kwargs)
	    return result

	def prepare_latents(batch_size, height, width, num_channels_latents=1):
	    """Create random packed latents on CUDA to use as example transformer input.

	    Args:
	        batch_size: Number of samples.
	        height: Image height in pixels.
	        width: Image width in pixels.
	        num_channels_latents: Channel count of the unpacked latents.

	    Returns:
	        bfloat16 CUDA tensor packed by ``FluxPipeline._pack_latents``.
	    """
	    # Pixel sizes are divided by a fixed factor of 16 and doubled again.
	    packing_divisor = 16
	    latent_h = 2 * (int(height) // packing_divisor)
	    latent_w = 2 * (int(width) // packing_divisor)

	    noise = torch.randn(
	        (batch_size, num_channels_latents, latent_h, latent_w),
	        dtype=torch.bfloat16,
	        device="cuda",
	    )
	    return FluxPipeline._pack_latents(noise, batch_size, num_channels_latents, latent_h, latent_w)

	def get_example_inputs(batch_size, height, width, num_channels_latents=1):
	    """Build random keyword arguments for one transformer call on CUDA.

	    NOTE: a later ``get_example_inputs()`` definition in this file rebinds the
	    name, so this variant is not reachable through it after import.

	    Args:
	        batch_size: Number of samples.
	        height: Image height in pixels.
	        width: Image width in pixels.
	        num_channels_latents: Channel count passed to ``prepare_latents``.

	    Returns:
	        Dict of keyword arguments with ``guidance`` and
	        ``joint_attention_kwargs`` set to None and ``return_dict`` set to False.
	    """
	    hidden_states = prepare_latents(batch_size, height, width, num_channels_latents)
	    num_img_sequences = hidden_states.shape[1]
	    # The former placeholder guidance tensor and the duplicate `return_dict`
	    # entry were overwritten right away; the final values are written directly.
	    # Key order matches the dict the previous implementation ended up with.
	    return {
	        "hidden_states": hidden_states,
	        "encoder_hidden_states": torch.randn(batch_size, 512, 4096, dtype=torch.bfloat16, device="cuda"),
	        "pooled_projections": torch.randn(batch_size, 768, dtype=torch.bfloat16, device="cuda"),
	        "timestep": torch.tensor([1.0], device="cuda").expand(batch_size),
	        "img_ids": torch.randn(num_img_sequences, 3, dtype=torch.bfloat16, device="cuda"),
	        "txt_ids": torch.randn(512, 3, dtype=torch.bfloat16, device="cuda"),
	        "guidance": None,
	        "return_dict": False,
	        "joint_attention_kwargs": None,
	    }

	def get_example_inputs(
	    path="/root/.cache/huggingface/hub/models--sayakpaul--flux.1-dev-int8-aot-compiled/snapshots/3b4f77e9752dd278c432870d101b958c902af2c9/serialized_inputs.pt",
	):
	    """Load serialized example transformer inputs and move them to CUDA.

	    NOTE: this definition rebinds the earlier
	    ``get_example_inputs(batch_size, height, width, ...)`` in this file.

	    Args:
	        path: Location of the serialized inputs file. Defaults to the
	            previously hard-coded snapshot path, so existing no-argument
	            calls behave as before.

	    Returns:
	        Dict of keyword arguments for the transformer, with
	        ``joint_attention_kwargs`` and ``guidance`` set to None and
	        ``return_dict`` set to False.
	    """
	    # weights_only=True restricts unpickling to tensors and plain containers.
	    loaded = torch.load(path, weights_only=True)
	    example_inputs = {name: value.to("cuda") for name, value in loaded.items()}
	    example_inputs.update({"joint_attention_kwargs": None, "return_dict": False})
	    example_inputs.update({"guidance": None})
	    return example_inputs

	def benchmark_fn(f, *args, **kwargs):
	    """Time ``f(*args, **kwargs)`` and return the mean run time as text.

	    The previous signature ``(f, args, *kwargs)`` accepted no keyword
	    arguments, so calls such as ``benchmark_fn(f, model, **inputs)`` raised
	    ``TypeError``; the timed statement likewise did not unpack its arguments.

	    Args:
	        f: Callable to benchmark.
	        *args: Positional arguments forwarded to ``f``.
	        **kwargs: Keyword arguments forwarded to ``f``.

	    Returns:
	        Mean time per call in seconds, formatted with three decimals.
	    """
	    timer = benchmark.Timer(
	        stmt="f(*args, **kwargs)",
	        globals={"args": args, "kwargs": kwargs, "f": f},
	        num_threads=torch.get_num_threads(),
	    )
	    return f"{(timer.blocked_autorange().mean):.3f}"


	def load_pipeline() -> Pipeline:
	    """Build the pipeline, attach the AOT-compiled transformer and warm it up.

	    Steps:
	    1. Load every component except the transformer from ``ckpt_id`` at
	       ``ckpt_revision`` in bfloat16 and move the pipeline to CUDA.
	    2. Load the pre-compiled transformer package from the Hugging Face hub
	       cache, run it on the example inputs and print a timing.
	    3. Install it as ``pipeline.transformer`` and run one warmup generation.

	    Returns:
	        The ready-to-use pipeline.
	    """
	    pipeline = FluxPipeline.from_pretrained(
	        ckpt_id,
	        revision=ckpt_revision,
	        transformer=None,  # replaced below by the AOT-compiled package
	        torch_dtype=torch.bfloat16,
	    )
	    pipeline.to("cuda")

	    path = os.path.join(HF_HUB_CACHE, "models--manbeast3b--Flux.1.la_schnella_transformer_aot/snapshots/56fa3ac58c770179f25f2453500a5160f1423b6c/flux_la_schnell_aten.so.pt2")
	    inputs1 = get_example_inputs()
	    print(f"AoT pre compiled path is {path}")

	    transformer = torch._inductor.aoti_load_package(path)
	    print(f"{transformer(**inputs1)[0].shape=}")

	    # A few untimed calls before benchmarking.
	    for _ in range(3):
	        _ = transformer(**inputs1)[0]

	    # Named `elapsed` so the imported `time` module is not shadowed; the
	    # printed text is the same as the former f"{time=} seconds.".
	    elapsed = benchmark_fn(f, transformer, **inputs1)
	    print(f"time={elapsed!r} seconds.")

	    pipeline.transformer = transformer

	    # One full generation with the same settings infer() uses.
	    warmup_ = "controllable varied focus thai warriors entertainment blue golden pink soft tough padthai"
	    for _ in range(1):
	        pipeline(
	            prompt=warmup_,
	            width=1024,
	            height=1024,
	            guidance_scale=0.0,
	            num_inference_steps=4,
	            max_sequence_length=256,
	        )
	    return pipeline

	# Module-level flag read by infer().
	# NOTE(review): it starts at 1 and is only ever set to 1, so the cleanup
	# branch in infer() never runs as written -- confirm the intended initial value.
	sample = 1


	@torch.no_grad()
	def infer(request: TextToImageRequest, pipeline: Pipeline, generator: Generator) -> Image.Image:
	    """Generate one image for ``request`` with a fixed 4-step, guidance-free setup.

	    Args:
	        request: Supplies ``prompt``, ``height`` and ``width``.
	        pipeline: Callable pipeline, as returned by ``load_pipeline()``.
	        generator: Random generator forwarded to the pipeline.

	    Returns:
	        The first image of the pipeline output (``output_type="pil"``).
	    """
	    global sample
	    if not sample:
	        sample = 1
	        gc.collect()
	        torch.cuda.empty_cache()
	        # The deprecated torch.cuda.reset_max_memory_allocated() call was dropped:
	        # it only forwards to reset_peak_memory_stats().
	        torch.cuda.reset_peak_memory_stats()

	    output = pipeline(
	        request.prompt,
	        generator=generator,
	        guidance_scale=0.0,
	        num_inference_steps=4,
	        max_sequence_length=256,
	        height=request.height,
	        width=request.width,
	        output_type="pil",
	    )
	    return output.images[0]