# ------------------------------------------------------------------
# First iteration (kept commented out for reference): load each model
# component separately and free it between stages so that only one
# large component occupies GPU memory at a time.
# ------------------------------------------------------------------
# from diffusers import FluxPipeline, AutoencoderKL
# from diffusers.image_processor import VaeImageProcessor
# from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
# import torch
# import gc
# from PIL.Image import Image
# from pipelines.models import TextToImageRequest
# from torch import Generator
# Pipeline = None
# CHECKPOINT = "black-forest-labs/FLUX.1-schnell"
# def empty_cache():
#     gc.collect()
#     torch.cuda.empty_cache()
#     torch.cuda.reset_max_memory_allocated()
#     torch.cuda.reset_peak_memory_stats()
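# load_pipeline() only warms the full encode/denoise/decode path once and
# returns the module-level placeholder; each stage below loads its own
# weights on demand.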
# def load_pipeline() -> Pipeline:
#     infer(TextToImageRequest(prompt=""), Pipeline)
#     return Pipeline
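# Stage 1: load only the tokenizers and text encoders (transformer and VAE
# set to None), encode the prompt, and return the embeddings so the encoders
# can be freed before denoising.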
# def encode_prompt(prompt: str):
#     text_encoder = CLIPTextModel.from_pretrained(
#         CHECKPOINT,
#         subfolder="text_encoder",
#         torch_dtype=torch.bfloat16,
#     )
#     text_encoder_2 = T5EncoderModel.from_pretrained(
#         CHECKPOINT,
#         subfolder="text_encoder_2",
#         torch_dtype=torch.bfloat16,
#     )
#     tokenizer = CLIPTokenizer.from_pretrained(CHECKPOINT, subfolder="tokenizer")
#     tokenizer_2 = T5TokenizerFast.from_pretrained(CHECKPOINT, subfolder="tokenizer_2")
#     pipeline = FluxPipeline.from_pretrained(
#         CHECKPOINT,
#         text_encoder=text_encoder,
#         text_encoder_2=text_encoder_2,
#         tokenizer=tokenizer,
#         tokenizer_2=tokenizer_2,
#         transformer=None,
#         vae=None,
#     ).to("cuda")
#     with torch.no_grad():
#         return pipeline.encode_prompt(
#             prompt=prompt,
#             prompt_2=None,
#             max_sequence_length=256,
#         )
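# Stage 2: load a transformer-only pipeline (all text/VAE components None)
# and run the 4-step schnell denoise, returning packed latents instead of
# decoded images (output_type="latent").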
# def infer_latents(prompt_embeds, pooled_prompt_embeds, width: int | None, height: int | None, seed: int | None):
#     pipeline = FluxPipeline.from_pretrained(
#         CHECKPOINT,
#         text_encoder=None,
#         text_encoder_2=None,
#         tokenizer=None,
#         tokenizer_2=None,
#         vae=None,
#         torch_dtype=torch.bfloat16,
#     ).to("cuda")
#     if seed is None:
#         generator = None
#     else:
#         generator = Generator(pipeline.device).manual_seed(seed)
#     return pipeline(
#         prompt_embeds=prompt_embeds,
#         pooled_prompt_embeds=pooled_prompt_embeds,
#         num_inference_steps=4,
#         guidance_scale=0.0,
#         width=width,
#         height=height,
#         generator=generator,
#         output_type="latent",
#     ).images
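# Stage 3: decode the packed latents with a freshly loaded VAE. The
# 2 ** len(block_out_channels) scale factor (16 for FLUX) matches the
# packed-latent convention FluxPipeline._unpack_latents expects, and the
# unscale/shift mirrors what the pipeline itself does before vae.decode().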
# def infer(request: TextToImageRequest, _pipeline: Pipeline) -> Image:
#     empty_cache()
#     prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(request.prompt)
#     empty_cache()
#     latents = infer_latents(prompt_embeds, pooled_prompt_embeds, request.width, request.height, request.seed)
#     empty_cache()
#     vae = AutoencoderKL.from_pretrained(
#         CHECKPOINT,
#         subfolder="vae",
#         torch_dtype=torch.bfloat16,
#     ).to("cuda")
#     vae_scale_factor = 2 ** len(vae.config.block_out_channels)
#     image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
#     height = request.height or 64 * vae_scale_factor
#     width = request.width or 64 * vae_scale_factor
#     with torch.no_grad():
#         latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
#         latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
#         image = vae.decode(latents, return_dict=False)[0]
#     return image_processor.postprocess(image, output_type="pil")[0]
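# ------------------------------------------------------------------
# Second iteration (kept commented out for reference): keep one
# resident pipeline, int8 weight-only quantize the VAE with torchao,
# and rely on sequential CPU offload instead of manual load/free stages.
# ------------------------------------------------------------------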
# from diffusers import FluxPipeline, AutoencoderKL, AutoencoderTiny
# from diffusers.image_processor import VaeImageProcessor
# from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
# from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
# import torch
# import torch._dynamo
# import gc
# from PIL import Image as img
# from PIL.Image import Image
# from pipelines.models import TextToImageRequest
# from torch import Generator
# import time
# from diffusers import FluxTransformer2DModel, DiffusionPipeline
# from torchao.quantization import quantize_, int8_weight_only
# # from torchao.quantization import autoquant
# Pipeline = None
# ckpt_id = "black-forest-labs/FLUX.1-schnell"
# def empty_cache():
#     start = time.time()
#     gc.collect()
#     torch.cuda.empty_cache()
#     torch.cuda.reset_max_memory_allocated()
#     torch.cuda.reset_peak_memory_stats()
#     print(f"Flush took: {time.time() - start}")
# def load_pipeline() -> Pipeline:
#     empty_cache()
#     dtype, device = torch.bfloat16, "cuda"
#     vae = AutoencoderKL.from_pretrained(
#         ckpt_id, subfolder="vae", torch_dtype=torch.bfloat16
#     )
#     quantize_(vae, int8_weight_only())
#     pipeline = DiffusionPipeline.from_pretrained(
#         ckpt_id,
#         vae=vae,
#         torch_dtype=dtype,
#     )
#     # pipeline.vae = AutoencoderTiny.from_pretrained("madebyollin/taef1", torch_dtype=torch.bfloat16)
#     pipeline.enable_sequential_cpu_offload()
#     for _ in range(2):
#         empty_cache()
#         pipeline(prompt="onomancy, aftergo, spirantic, Platyhelmia, modificator, drupaceous, jobbernowl, hereness", width=1024, height=1024, guidance_scale=0.0, num_inference_steps=4, max_sequence_length=256)
#     return pipeline
# from datetime import datetime
# @torch.inference_mode()
# def infer(request: TextToImageRequest, pipeline: Pipeline) -> Image:
#     empty_cache()
#     generator = Generator("cuda").manual_seed(request.seed)
#     image = pipeline(request.prompt, generator=generator, guidance_scale=0.0, num_inference_steps=4, max_sequence_length=256, height=request.height, width=request.width, output_type="pil").images[0]
#     return image
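# ------------------------------------------------------------------
# Current version: resident bf16 pipeline with a standalone T5 encoder
# checkpoint, a torch.compile'd VAE pinned to the GPU, and sequential
# CPU offload for everything else.
# ------------------------------------------------------------------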
from diffusers import FluxPipeline, AutoencoderKL, AutoencoderTiny
from diffusers.image_processor import VaeImageProcessor
from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
import torch
import torch._dynamo
import gc
from PIL import Image as img
from PIL.Image import Image
from pipelines.models import TextToImageRequest
from torch import Generator
import time
from diffusers import FluxTransformer2DModel, DiffusionPipeline
from torchao.quantization import quantize_, int8_weight_only
import os
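# Allocator tuning must be set before the first CUDA allocation to take effect:
# garbage_collection_threshold makes the caching allocator reclaim unused blocks
# once usage passes that fraction, and max_split_size_mb caps block splitting to
# limit fragmentation.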
| os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False,garbage_collection_threshold:0.01,max_split_size_mb:512" | |
| Pipeline = None | |
| ckpt_id = "black-forest-labs/FLUX.1-schnell" | |
def empty_cache():
    start = time.time()
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.reset_max_memory_allocated()
    torch.cuda.reset_peak_memory_stats()
    print(f"Flush took: {time.time() - start}")
def load_pipeline() -> Pipeline:
    empty_cache()
    dtype, device = torch.bfloat16, "cuda"
    text_encoder_2 = T5EncoderModel.from_pretrained(
        "city96/t5-v1_1-xxl-encoder-bf16", torch_dtype=torch.bfloat16
    )
    vae = AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype)
    pipeline = DiffusionPipeline.from_pretrained(
        ckpt_id,
        vae=vae,
        text_encoder_2=text_encoder_2,
        torch_dtype=dtype,
    )
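    # Global speed knobs: let cuDNN autotune convolution algorithms, allow TF32
    # matmuls, claim nearly the whole GPU for this process, and switch the large
    # modules to channels_last memory format.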
    torch.backends.cudnn.benchmark = True
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.cuda.set_per_process_memory_fraction(0.99)
    pipeline.text_encoder.to(memory_format=torch.channels_last)
    pipeline.transformer.to(memory_format=torch.channels_last)
    pipeline.vae.to(memory_format=torch.channels_last)
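    # Tile the VAE decode to bound its peak memory, then compile it; compilation
    # is worthwhile here because the VAE is the one module that stays on the GPU.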
    pipeline.vae.enable_tiling()
    pipeline.vae = torch.compile(pipeline.vae)
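    # Keep the compiled VAE resident while every other component is offloaded to
    # CPU and streamed back layer by layer during the forward pass.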
    pipeline._exclude_from_cpu_offload = ["vae"]
    pipeline.enable_sequential_cpu_offload()
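    # Two throwaway generations (an intentionally obscure prompt at full
    # 1024x1024) so torch.compile, cuDNN autotuning, and the offload hooks all
    # warm up at load time rather than on the first real request.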
    for _ in range(2):
        pipeline(prompt="onomancy, aftergo, spirantic, Platyhelmia, modificator, drupaceous, jobbernowl, hereness",
                 width=1024, height=1024, guidance_scale=0.0, num_inference_steps=4, max_sequence_length=256)
    return pipeline
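# Per-request entry point: 4-step schnell sampling at guidance_scale=0.0,
# deterministic whenever a seed is supplied.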
def infer(request: TextToImageRequest, pipeline: Pipeline) -> Image:
    torch.cuda.reset_peak_memory_stats()
    torch.backends.cuda.matmul.allow_tf32 = True
    # request.seed may be None; fall back to a nondeterministic generator then.
    generator = None if request.seed is None else Generator("cuda").manual_seed(request.seed)
    image = pipeline(request.prompt, generator=generator, guidance_scale=0.0, num_inference_steps=4,
                     max_sequence_length=256, height=request.height, width=request.width, output_type="pil").images[0]
    return image