Upload handler.py
Browse files — handler.py (+6, −4)
handler.py
CHANGED
|
@@ -30,7 +30,7 @@ IS_LVRAM = False
|
|
| 30 |
IS_COMPILE = True
|
| 31 |
IS_WARM = True
|
| 32 |
IS_QUANT = True
|
| 33 |
-
IS_AUTOQ =
|
| 34 |
IS_CC90 = True if torch.cuda.get_device_capability() >= (9, 0) else False
|
| 35 |
IS_CC89 = True if torch.cuda.get_device_capability() >= (8, 9) else False
|
| 36 |
|
|
@@ -217,8 +217,7 @@ def load_pipeline_fast(repo_id: str, dtype: torch.dtype) -> Any:
|
|
| 217 |
pipe.transformer.to(memory_format=torch.channels_last)
|
| 218 |
pipe.vae.to(memory_format=torch.channels_last)
|
| 219 |
apply_cache_on_pipe(pipe, residual_diff_threshold=0.12)
|
| 220 |
-
if IS_QUANT:
|
| 221 |
-
int8_dynamic_activation_int4_weight()
|
| 222 |
quantize_(pipe.text_encoder, int8_dynamic_activation_int8_weight())
|
| 223 |
quantize_(pipe.text_encoder_2, int8_dynamic_activation_int8_weight())
|
| 224 |
if IS_CC90: quantize_(pipe.transformer, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")
|
|
@@ -237,7 +236,7 @@ class EndpointHandler:
|
|
| 237 |
#dtype = torch.float16 # for older nVidia GPUs
|
| 238 |
print_vram()
|
| 239 |
print("Loading pipeline...")
|
| 240 |
-
if IS_AUTOQ: self.pipeline =
|
| 241 |
elif IS_COMPILE: self.pipeline = load_pipeline_fast(repo_id, dtype)
|
| 242 |
elif IS_LVRAM and IS_CC89: self.pipeline = load_pipeline_lowvram(repo_id, dtype)
|
| 243 |
else: self.pipeline = load_pipeline_stable(repo_id, dtype)
|
|
@@ -250,6 +249,9 @@ class EndpointHandler:
|
|
| 250 |
print("Compiling pipeline...")
|
| 251 |
self.pipeline.transformer = torch.compile(self.pipeline.transformer, mode="max-autotune-no-cudagraphs")
|
| 252 |
self.pipeline.vae = torch.compile(self.pipeline.vae, mode="max-autotune-no-cudagraphs")
|
|
|
|
|
|
|
|
|
|
| 253 |
gc.collect()
|
| 254 |
torch.cuda.empty_cache()
|
| 255 |
print_vram()
|
|
|
|
| 30 |
IS_COMPILE = True
|
| 31 |
IS_WARM = True
|
| 32 |
IS_QUANT = True
|
| 33 |
+
IS_AUTOQ = True
|
| 34 |
IS_CC90 = True if torch.cuda.get_device_capability() >= (9, 0) else False
|
| 35 |
IS_CC89 = True if torch.cuda.get_device_capability() >= (8, 9) else False
|
| 36 |
|
|
|
|
| 217 |
pipe.transformer.to(memory_format=torch.channels_last)
|
| 218 |
pipe.vae.to(memory_format=torch.channels_last)
|
| 219 |
apply_cache_on_pipe(pipe, residual_diff_threshold=0.12)
|
| 220 |
+
if IS_QUANT and not IS_AUTOQ:
|
|
|
|
| 221 |
quantize_(pipe.text_encoder, int8_dynamic_activation_int8_weight())
|
| 222 |
quantize_(pipe.text_encoder_2, int8_dynamic_activation_int8_weight())
|
| 223 |
if IS_CC90: quantize_(pipe.transformer, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")
|
|
|
|
| 236 |
#dtype = torch.float16 # for older nVidia GPUs
|
| 237 |
print_vram()
|
| 238 |
print("Loading pipeline...")
|
| 239 |
+
if IS_AUTOQ: self.pipeline = load_pipeline_fast(repo_id, dtype)
|
| 240 |
elif IS_COMPILE: self.pipeline = load_pipeline_fast(repo_id, dtype)
|
| 241 |
elif IS_LVRAM and IS_CC89: self.pipeline = load_pipeline_lowvram(repo_id, dtype)
|
| 242 |
else: self.pipeline = load_pipeline_stable(repo_id, dtype)
|
|
|
|
| 249 |
print("Compiling pipeline...")
|
| 250 |
self.pipeline.transformer = torch.compile(self.pipeline.transformer, mode="max-autotune-no-cudagraphs")
|
| 251 |
self.pipeline.vae = torch.compile(self.pipeline.vae, mode="max-autotune-no-cudagraphs")
|
| 252 |
+
if IS_AUTOQ:
|
| 253 |
+
self.pipeline.transformer = autoquant(self.pipeline.transformer, error_on_unseen=False)
|
| 254 |
+
self.pipeline.vae = autoquant(self.pipeline.vae, error_on_unseen=False)
|
| 255 |
gc.collect()
|
| 256 |
torch.cuda.empty_cache()
|
| 257 |
print_vram()
|