Update src/pipeline.py

src/pipeline.py  CHANGED  (+32, -164)
@@ -1,142 +1,14 @@
-
-# from diffusers.image_processor import VaeImageProcessor
-# from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
-# import torch
-# import gc
-# from PIL.Image import Image
-# from pipelines.models import TextToImageRequest
-# from torch import Generator
-
-# Pipeline = None
-
-# CHECKPOINT = "black-forest-labs/FLUX.1-schnell"
-
-# def empty_cache():
-#     gc.collect()
-#     torch.cuda.empty_cache()
-#     torch.cuda.reset_max_memory_allocated()
-#     torch.cuda.reset_peak_memory_stats()
-
-# def load_pipeline() -> Pipeline:
-#     infer(TextToImageRequest(prompt=""), Pipeline)
-
-#     return Pipeline
-
-
-# def encode_prompt(prompt: str):
-#     text_encoder = CLIPTextModel.from_pretrained(
-#         CHECKPOINT,
-#         subfolder="text_encoder",
-#         torch_dtype=torch.bfloat16,
-#     )
-
-#     text_encoder_2 = T5EncoderModel.from_pretrained(
-#         CHECKPOINT,
-#         subfolder="text_encoder_2",
-#         torch_dtype=torch.bfloat16,
-#     )
-
-#     tokenizer = CLIPTokenizer.from_pretrained(CHECKPOINT, subfolder="tokenizer")
-#     tokenizer_2 = T5TokenizerFast.from_pretrained(CHECKPOINT, subfolder="tokenizer_2")
-
-#     pipeline = FluxPipeline.from_pretrained(
-#         CHECKPOINT,
-#         text_encoder=text_encoder,
-#         text_encoder_2=text_encoder_2,
-#         tokenizer=tokenizer,
-#         tokenizer_2=tokenizer_2,
-#         transformer=None,
-#         vae=None,
-#     ).to("cuda")
-
-#     with torch.no_grad():
-#         return pipeline.encode_prompt(
-#             prompt=prompt,
-#             prompt_2=None,
-#             max_sequence_length=256,
-#         )
-
-
-# def infer_latents(prompt_embeds, pooled_prompt_embeds, width: int | None, height: int | None, seed: int | None):
-#     pipeline = FluxPipeline.from_pretrained(
-#         CHECKPOINT,
-#         text_encoder=None,
-#         text_encoder_2=None,
-#         tokenizer=None,
-#         tokenizer_2=None,
-#         vae=None,
-#         torch_dtype=torch.bfloat16,
-#     ).to("cuda")
-
-#     if seed is None:
-#         generator = None
-#     else:
-#         generator = Generator(pipeline.device).manual_seed(seed)
-
-#     return pipeline(
-#         prompt_embeds=prompt_embeds,
-#         pooled_prompt_embeds=pooled_prompt_embeds,
-#         num_inference_steps=4,
-#         guidance_scale=0.0,
-#         width=width,
-#         height=height,
-#         generator=generator,
-#         output_type="latent",
-#     ).images
-
-
-# def infer(request: TextToImageRequest, _pipeline: Pipeline) -> Image:
-#     empty_cache()
-
-#     prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(request.prompt)
-
-#     empty_cache()
-
-#     latents = infer_latents(prompt_embeds, pooled_prompt_embeds, request.width, request.height, request.seed)
-
-#     empty_cache()
-
-#     vae = AutoencoderKL.from_pretrained(
-#         CHECKPOINT,
-#         subfolder="vae",
-#         torch_dtype=torch.bfloat16,
-#     ).to("cuda")
-
-#     vae_scale_factor = 2 ** (len(vae.config.block_out_channels))
-#     image_processor = VaeImageProcessor(vae_scale_factor=vae_scale_factor)
-
-#     height = request.height or 64 * vae_scale_factor
-#     width = request.width or 64 * vae_scale_factor
-
-#     with torch.no_grad():
-#         latents = FluxPipeline._unpack_latents(latents, height, width, vae_scale_factor)
-#         latents = (latents / vae.config.scaling_factor) + vae.config.shift_factor
-
-#         image = vae.decode(latents, return_dict=False)[0]
-#         return image_processor.postprocess(image, output_type="pil")[0]
-
-
-from diffusers import FluxPipeline, AutoencoderKL, AutoencoderTiny
-from diffusers.image_processor import VaeImageProcessor
-from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
-
-from transformers import T5EncoderModel, T5TokenizerFast, CLIPTokenizer, CLIPTextModel
+from diffusers import AutoencoderKL
 import torch
 import torch.nn as nn
-import torch._dynamo
 import gc
-from PIL import Image as img
 from PIL.Image import Image
 from pipelines.models import TextToImageRequest
 from torch import Generator
-import
-from diffusers import FluxTransformer2DModel, DiffusionPipeline
-# from torchao.quantization import quantize_,int8_weight_only
+from diffusers import DiffusionPipeline
+from transformers import T5EncoderModel
 import os
 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False,garbage_collection_threshold:0.01"
-Pipeline = None
-
-

 def fp8_linear_forward(cls, original_dtype, input):
     weight_dtype = cls.weight.dtype
@@ -171,76 +43,72 @@ def fp8_linear_forward(cls, original_dtype, input):
     else:
         return cls.original_forward(input)

-def convert_fp8_linear(module, original_dtype):
+def convert(module, original_dtype):
     setattr(module, "fp8_matmul_enabled", True)
     for name, module in module.named_modules():
         if isinstance(module, nn.Linear):
             if "blocks" in name:
-                print("changing")
-                #print(module, name)
                 original_forward = module.forward
                 setattr(module, "original_forward", original_forward)
                 setattr(module, "forward", lambda input, m=module: fp8_linear_forward(m, original_dtype, input))


-def replace_with_fp8_linear(transformer, original_dtype):
-    """
-    Replace all nn.Linear layers in the transformer with FP8-enabled linear layers.
-    """
+def replace(transformer, original_dtype):
     for name, module in transformer.named_modules():
         if isinstance(module, nn.Linear):
-
-            convert_fp8_linear(module, original_dtype)
+            convert(module, original_dtype)
     return transformer

-
-
-
+Pipeline = None
+MODEL_ID = "black-forest-labs/FLUX.1-schnell"
+DTYPE = torch.bfloat16
+def clear():
     gc.collect()
-    torch.cuda.empty_cache()
     torch.cuda.reset_max_memory_allocated()
     torch.cuda.reset_peak_memory_stats()
-
+    torch.cuda.empty_cache()

 def load_pipeline() -> Pipeline:
-
-
-
-
-
-    )
-    vae=AutoencoderKL.from_pretrained(ckpt_id, subfolder="vae", torch_dtype=dtype)
+    # restart
+    clear()
+
+    # setup
+    text_encoder_2 = T5EncoderModel.from_pretrained("city96/t5-v1_1-xxl-encoder-bf16", torch_dtype=DTYPE)
+    vae = AutoencoderKL.from_pretrained(MODEL_ID, subfolder="vae", torch_dtype=DTYPE)
     pipeline = DiffusionPipeline.from_pretrained(
-
+        MODEL_ID,
         vae=vae,
-        text_encoder_2
-        torch_dtype=
+        text_encoder_2=text_encoder_2,
+        torch_dtype=DTYPE,
     )
+    # optimize
     torch.backends.cudnn.benchmark = True
     torch.backends.cuda.matmul.allow_tf32 = True
     torch.cuda.set_per_process_memory_fraction(0.9)
     pipeline.text_encoder.to(memory_format=torch.channels_last)
+    pipeline.text_encoder_2.to(memory_format=torch.channels_last)
     pipeline.transformer.to(memory_format=torch.channels_last)
-
-    # Replace Linear layers in the Transformer with FP8 Linear layers
-    # pipeline.text_encoder_2 = replace_with_fp8_linear(pipeline.text_encoder_2, original_dtype=dtype)
-    pipeline.vae = replace_with_fp8_linear(pipeline.vae, original_dtype=dtype)
-
-
+    pipeline.vae = replace(pipeline.vae, original_dtype=DTYPE)
     pipeline.vae.to(memory_format=torch.channels_last)
     pipeline.vae = torch.compile(pipeline.vae)
-
     pipeline._exclude_from_cpu_offload = ["vae"]
     pipeline.enable_sequential_cpu_offload()
-
+
+    # warm up once
+    clear()
+    for _ in range(1):
         pipeline(prompt="onomancy, aftergo, spirantic, Platyhelmia, modificator, drupaceous, jobbernowl, hereness", width=1024, height=1024, guidance_scale=0.0, num_inference_steps=4, max_sequence_length=256)

     return pipeline


+sample = True
 @torch.inference_mode()
 def infer(request: TextToImageRequest, pipeline: Pipeline) -> Image:
-
+    global sample
+    if sample:
+        clear()
+        sample = None
     generator = Generator("cuda").manual_seed(request.seed)
     image=pipeline(request.prompt,generator=generator, guidance_scale=0.0, num_inference_steps=4, max_sequence_length=256, height=request.height, width=request.width, output_type="pil").images[0]
     return(image)
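A few notes follow for readers of the diff.

The body of fp8_linear_forward sits in the unchanged region between the two hunks, so only its first line and its final fallback branch are visible above. As a rough sketch of the shape such a shim usually takes — assuming the layer's weight has been pre-cast to an FP8 storage dtype, and noting that the actual elided body may instead quantize on the fly or call a scaled-matmul kernel — something like:

import torch
import torch.nn.functional as F

def fp8_linear_forward_sketch(cls, original_dtype, input):
    # Same frame as the visible lines of fp8_linear_forward: inspect the
    # stored weight dtype, take the FP8 path when it applies, else fall back.
    weight_dtype = cls.weight.dtype
    if weight_dtype in (torch.float8_e4m3fn, torch.float8_e5m2):
        # The resident weight stays in 1-byte FP8 storage; a transient copy
        # is upcast to the activation dtype just for the matmul.
        weight = cls.weight.to(original_dtype)
        bias = cls.bias.to(original_dtype) if cls.bias is not None else None
        return F.linear(input, weight, bias)
    else:
        return cls.original_forward(input)

If the weights are indeed stored in FP8, each patched nn.Linear then holds one byte per element instead of bfloat16's two.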
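On the allocator line near the top of the new file: garbage_collection_threshold:0.01 asks PyTorch's CUDA caching allocator to start reclaiming unused cached blocks once usage crosses a small fraction of the allowed pool (capped at 90% of the device by torch.cuda.set_per_process_memory_fraction(0.9)), and expandable_segments:False keeps the allocator's default fixed-size segments. Together with the explicit clear() calls, the configuration trades some allocator throughput for VRAM headroom, which is the scarce resource once sequential offload starts paging submodules through the GPU.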
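One easy-to-miss detail in convert: the patched forward binds the current layer through a default argument (m=module) rather than a plain closure. A plain closure would capture the loop variable by reference, so after the loop finished every patched layer would dispatch through the last nn.Linear visited; the default argument freezes the binding per iteration. The standard demonstration of the pitfall:

fns = [lambda: i for i in range(3)]
fns_fixed = [lambda i=i: i for i in range(3)]
print([f() for f in fns])        # [2, 2, 2]  -- late binding
print([f() for f in fns_fixed])  # [0, 1, 2]  -- value frozen per iteration

(The loop also rebinds the name module on each iteration, which is safe only because module.named_modules() is evaluated once, before the first rebinding.)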
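Finally, the intended call pattern for the two entry points, as a hypothetical driver script: TextToImageRequest is assumed to expose the prompt, width, height and seed fields that infer reads, and the prompt string and output path below are illustrative only. Note how the pieces fit together: _exclude_from_cpu_offload = ["vae"] keeps the torch.compile'd VAE resident on the GPU, while enable_sequential_cpu_offload() pages the text encoders and the transformer through VRAM one submodule at a time.

from pipelines.models import TextToImageRequest

pipeline = load_pipeline()   # load, patch the VAE's Linear layers, compile, warm up
request = TextToImageRequest(
    prompt="a watercolor fox in a snowy forest",   # illustrative values
    width=1024,
    height=1024,
    seed=42,
)
image = infer(request, pipeline)   # returns a PIL image
image.save("out.png")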