NoMoreCopyrightOrg
/

flux-test2

English

Model card Files and versions

xet

Community

John6666 committed on Mar 9, 2025

Commit

da6f0ba

verified ·

1 Parent(s): 3a1fdfd

Upload handler.py

Browse files

Files changed (1) hide show

handler.py +19 -16

handler.py CHANGED Viewed

@@ -29,6 +29,7 @@ IS_MGPU = False
 IS_LVRAM = False
 IS_COMPILE = True
 IS_WARM = True
 IS_AUTOQ = False
 IS_CC90 = True if torch.cuda.get_device_capability() >= (9, 0) else False
 IS_CC89 = True if torch.cuda.get_device_capability() >= (8, 9) else False
@@ -41,11 +42,11 @@ if IS_COMPILE:
     import torch._dynamo
     torch._dynamo.config.suppress_errors = False
     #torch._dynamo.config.suppress_errors = True
-    torch._inductor.config.disable_progress = False
-    torch._inductor.config.conv_1x1_as_mm = True
-    torch._inductor.config.coordinate_descent_tuning = True
-    torch._inductor.config.coordinate_descent_check_all_directions = True
-    torch._inductor.config.epilogue_fusion = False
 if IS_MGPU:
     import torch.distributed as dist
@@ -211,19 +212,21 @@ def load_pipeline_fast(repo_id: str, dtype: torch.dtype) -> Any:
     pipe = FluxPipeline.from_pretrained(repo_id, torch_dtype=dtype).to("cuda")
     pipe.enable_vae_slicing()
     pipe.enable_vae_tiling()
-    apply_cache_on_pipe(pipe, residual_diff_threshold=0.12)
-    if IS_CC90: quantize_(pipe.transformer, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")
-    elif IS_CC89: quantize_(pipe.transformer, float8_dynamic_activation_float8_weight(), device="cuda")
-    #pipe.transformer.fuse_qkv_projections()
     pipe.transformer.to(memory_format=torch.channels_last)
-    #pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune", fullgraph=True)
-    #pipe.transformer = torch.compile(pipe.transformer, mode="max-autotune-no-cudagraphs")
-    if IS_CC90: quantize_(pipe.vae, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")
-    elif IS_CC89: quantize_(pipe.vae, float8_dynamic_activation_float8_weight(), device="cuda")
-    #pipe.vae.fuse_qkv_projections()
     pipe.vae.to(memory_format=torch.channels_last)
-    #pipe.vae = torch.compile(pipe.vae, mode="max-autotune", fullgraph=True)
-    #pipe.vae = torch.compile(pipe.vae, mode="max-autotune-no-cudagraphs")
     return pipe
 class EndpointHandler:

 IS_LVRAM = False
 IS_COMPILE = True
 IS_WARM = True
+IS_QUANT = True
 IS_AUTOQ = False
 IS_CC90 = True if torch.cuda.get_device_capability() >= (9, 0) else False
 IS_CC89 = True if torch.cuda.get_device_capability() >= (8, 9) else False
     import torch._dynamo
     torch._dynamo.config.suppress_errors = False
     #torch._dynamo.config.suppress_errors = True
+    #torch._inductor.config.disable_progress = False
+    #torch._inductor.config.conv_1x1_as_mm = True
+    #torch._inductor.config.coordinate_descent_tuning = True
+    #torch._inductor.config.coordinate_descent_check_all_directions = True
+    #torch._inductor.config.epilogue_fusion = False
 if IS_MGPU:
     import torch.distributed as dist
     pipe = FluxPipeline.from_pretrained(repo_id, torch_dtype=dtype).to("cuda")
     pipe.enable_vae_slicing()
     pipe.enable_vae_tiling()
+    pipe.transformer.fuse_qkv_projections()
+    pipe.vae.fuse_qkv_projections()
     pipe.transformer.to(memory_format=torch.channels_last)
     pipe.vae.to(memory_format=torch.channels_last)
+    apply_cache_on_pipe(pipe, residual_diff_threshold=0.12)
+    if IS_QUANT:
+        int8_dynamic_activation_int4_weight()
+        quantize_(pipe.text_encoder, int8_dynamic_activation_int8_weight())
+        quantize_(pipe.text_encoder_2, int8_dynamic_activation_int8_weight())
+        if IS_CC90: quantize_(pipe.transformer, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")
+        elif IS_CC89: quantize_(pipe.transformer, float8_dynamic_activation_float8_weight(), device="cuda")
+        else: quantize_(pipe.vae, int8_dynamic_activation_int4_weight())
+        if IS_CC90: quantize_(pipe.vae, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")
+        elif IS_CC89: quantize_(pipe.vae, float8_dynamic_activation_float8_weight(), device="cuda")
+        else: quantize_(pipe.vae, int8_dynamic_activation_int8_weight())
     return pipe
 class EndpointHandler: