Update src/pipeline.py
Browse files — src/pipeline.py (+14, −11)
src/pipeline.py
CHANGED
|
@@ -15,7 +15,7 @@ Pipeline = None
|
|
| 15 |
# Configure CUDA settings
|
| 16 |
torch.backends.cudnn.benchmark = True
|
| 17 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 18 |
-
torch.cuda.set_per_process_memory_fraction(0.
|
| 19 |
|
| 20 |
class BasicQuantization:
|
| 21 |
def __init__(self, bits=1):
|
|
@@ -59,14 +59,14 @@ def load_pipeline() -> Pipeline:
|
|
| 59 |
quantizer = ModelQuantization(vae)
|
| 60 |
quantizer.quantize_model()
|
| 61 |
|
| 62 |
-
text_encoder_2 = T5EncoderModel.from_pretrained(
|
| 63 |
-
|
| 64 |
-
)
|
| 65 |
|
| 66 |
# Initialize pipeline
|
| 67 |
pipeline = FluxPipeline.from_pretrained(
|
| 68 |
ckpt_id,
|
| 69 |
-
text_encoder_2=text_encoder_2,
|
| 70 |
vae=vae,
|
| 71 |
torch_dtype=dtype
|
| 72 |
)
|
|
@@ -77,17 +77,20 @@ def load_pipeline() -> Pipeline:
|
|
| 77 |
component.to(memory_format=torch.channels_last)
|
| 78 |
|
| 79 |
# Compile and configure pipeline
|
| 80 |
-
pipeline.vae = torch.compile(
|
| 81 |
pipeline._exclude_from_cpu_offload = ["vae"]
|
| 82 |
pipeline.enable_sequential_cpu_offload()
|
| 83 |
-
|
|
|
|
|
|
|
|
|
|
| 84 |
# Warmup run
|
| 85 |
empty_cache()
|
| 86 |
-
for _ in range(
|
| 87 |
pipeline(
|
| 88 |
prompt="posteroexternal, eurythmical, inspection, semicotton, specification, Mercatorial, ethylate, misprint",
|
| 89 |
-
width=
|
| 90 |
-
height=
|
| 91 |
guidance_scale=0.0,
|
| 92 |
num_inference_steps=4,
|
| 93 |
max_sequence_length=256
|
|
@@ -111,7 +114,7 @@ def infer(request: TextToImageRequest, pipeline: Pipeline) -> Image:
|
|
| 111 |
empty_cache()
|
| 112 |
_inference_count = 0
|
| 113 |
|
| 114 |
-
torch.cuda.reset_peak_memory_stats()
|
| 115 |
generator = Generator("cuda").manual_seed(request.seed)
|
| 116 |
return pipeline(
|
| 117 |
prompt=request.prompt,
|
|
|
|
| 15 |
# Configure CUDA settings
|
| 16 |
torch.backends.cudnn.benchmark = True
|
| 17 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 18 |
+
torch.cuda.set_per_process_memory_fraction(0.99)
|
| 19 |
|
| 20 |
class BasicQuantization:
|
| 21 |
def __init__(self, bits=1):
|
|
|
|
| 59 |
quantizer = ModelQuantization(vae)
|
| 60 |
quantizer.quantize_model()
|
| 61 |
|
| 62 |
+
# text_encoder_2 = T5EncoderModel.from_pretrained(
|
| 63 |
+
# "city96/t5-v1_1-xxl-encoder-bf16", torch_dtype=torch.bfloat16
|
| 64 |
+
# )
|
| 65 |
|
| 66 |
# Initialize pipeline
|
| 67 |
pipeline = FluxPipeline.from_pretrained(
|
| 68 |
ckpt_id,
|
| 69 |
+
# text_encoder_2=text_encoder_2,
|
| 70 |
vae=vae,
|
| 71 |
torch_dtype=dtype
|
| 72 |
)
|
|
|
|
| 77 |
component.to(memory_format=torch.channels_last)
|
| 78 |
|
| 79 |
# Compile and configure pipeline
|
| 80 |
+
pipeline.vae = torch.compile(pipe.vae, mode="reduce-overhead")  # NOTE(review): `pipe` is undefined in this diff — every other line uses `pipeline`; presumably this should be `pipeline.vae` (would raise NameError as committed) — verify against the full file
|
| 81 |
pipeline._exclude_from_cpu_offload = ["vae"]
|
| 82 |
pipeline.enable_sequential_cpu_offload()
|
| 83 |
+
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
|
| 84 |
+
pipeline.vae.encoder.to(device)
|
| 85 |
+
pipeline.vae.decoder.to(device)
|
| 86 |
+
|
| 87 |
# Warmup run
|
| 88 |
empty_cache()
|
| 89 |
+
for _ in range(2):
|
| 90 |
pipeline(
|
| 91 |
prompt="posteroexternal, eurythmical, inspection, semicotton, specification, Mercatorial, ethylate, misprint",
|
| 92 |
+
width=1480,
|
| 93 |
+
height=1480,
|
| 94 |
guidance_scale=0.0,
|
| 95 |
num_inference_steps=4,
|
| 96 |
max_sequence_length=256
|
|
|
|
| 114 |
empty_cache()
|
| 115 |
_inference_count = 0
|
| 116 |
|
| 117 |
+
# torch.cuda.reset_peak_memory_stats()
|
| 118 |
generator = Generator("cuda").manual_seed(request.seed)
|
| 119 |
return pipeline(
|
| 120 |
prompt=request.prompt,
|