Update src/pipeline.py
Browse files — src/pipeline.py (+38 −2)
src/pipeline.py
CHANGED
|
@@ -14,6 +14,12 @@ import time
|
|
| 14 |
from diffusers import FluxTransformer2DModel, DiffusionPipeline
|
| 15 |
# from torchao.quantization import quantize_,int8_weight_only
|
| 16 |
import os
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 17 |
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False,garbage_collection_threshold:0.01"
|
| 18 |
Pipeline = None
|
| 19 |
|
|
@@ -44,8 +50,38 @@ def load_pipeline() -> Pipeline:
|
|
| 44 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 45 |
torch.cuda.set_per_process_memory_fraction(0.99)
|
| 46 |
pipeline.text_encoder.to(memory_format=torch.channels_last)
|
| 47 |
-
pipeline.transformer.to(memory_format=torch.channels_last)
|
| 48 |
-
quantize_dynamic(pipeline.transformer, dtype=torch.float8_e5m2fnuz, inplace=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
pipeline.vae.to(memory_format=torch.channels_last)
|
| 51 |
pipeline.vae = torch.compile(pipeline.vae)
|
|
|
|
| 14 |
from diffusers import FluxTransformer2DModel, DiffusionPipeline
|
| 15 |
# from torchao.quantization import quantize_,int8_weight_only
|
| 16 |
import os
|
| 17 |
+
|
| 18 |
+
from torch.ao.quantization import prepare, convert
|
| 19 |
+
from torch.ao.quantization import QConfig
|
| 20 |
+
from torch.ao.quantization.observer import MinMaxObserver
|
| 21 |
+
from torch.ao.quantization.quantize import quantize_dynamic
|
| 22 |
+
|
| 23 |
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:False,garbage_collection_threshold:0.01"
|
| 24 |
Pipeline = None
|
| 25 |
|
|
|
|
| 50 |
torch.backends.cuda.matmul.allow_tf32 = True
|
| 51 |
torch.cuda.set_per_process_memory_fraction(0.99)
|
| 52 |
pipeline.text_encoder.to(memory_format=torch.channels_last)
|
| 53 |
+
# pipeline.transformer.to(memory_format=torch.channels_last)
|
| 54 |
+
# quantize_dynamic(pipeline.transformer, dtype=torch.float8_e5m2fnuz, inplace=True)
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
# Define a custom qconfig for float8_e5m2fnuz
|
| 58 |
+
float8_observer = MinMaxObserver.with_args(dtype=torch.float8_e5m2fnuz)
|
| 59 |
+
custom_qconfig = QConfig(
|
| 60 |
+
activation=float8_observer,
|
| 61 |
+
weight=float8_observer
|
| 62 |
+
)
|
| 63 |
+
qconfig_spec = {
|
| 64 |
+
"linear": custom_qconfig,
|
| 65 |
+
"linear_1": custom_qconfig,
|
| 66 |
+
"linear_2": custom_qconfig,
|
| 67 |
+
"to_q": custom_qconfig,
|
| 68 |
+
"to_k": custom_qconfig,
|
| 69 |
+
"to_v": custom_qconfig,
|
| 70 |
+
"add_k_proj": custom_qconfig,
|
| 71 |
+
"add_v_proj": custom_qconfig,
|
| 72 |
+
"add_q_proj": custom_qconfig,
|
| 73 |
+
"proj": custom_qconfig,
|
| 74 |
+
"proj_mlp": custom_qconfig,
|
| 75 |
+
"proj_out": custom_qconfig
|
| 76 |
+
}
|
| 77 |
+
|
| 78 |
+
# Apply dynamic quantization to Transformer
|
| 79 |
+
pipeline.transformer = quantize_dynamic(
|
| 80 |
+
pipeline.transformer,
|
| 81 |
+
qconfig_spec=qconfig_spec, # Apply qconfig only to transformer layers
|
| 82 |
+
dtype=torch.float8_e5m2fnuz,
|
| 83 |
+
inplace=True,
|
| 84 |
+
)
|
| 85 |
|
| 86 |
pipeline.vae.to(memory_format=torch.channels_last)
|
| 87 |
pipeline.vae = torch.compile(pipeline.vae)
|