NoMoreCopyrightOrg
/

flux-test2

English

Model card Files Files and versions

xet

Community

John6666 committed on Mar 6, 2025

Commit

2b7dcd2

verified ·

1 Parent(s): e23490b

Upload handler.py

Browse files

Files changed (1) hide show

handler.py +25 -7

handler.py CHANGED Viewed

@@ -2,20 +2,24 @@
 import os
 from typing import Any, Dict
-from diffusers import FluxPipeline, FluxTransformer2DModel, AutoencoderKL, TorchAoConfig
 from PIL import Image
-import torch
-from torchao.quantization import quantize_, autoquant, int8_dynamic_activation_int8_weight, int8_dynamic_activation_int4_weight
 from huggingface_hub import hf_hub_download
-import gc
 import subprocess
 subprocess.run("pip list", shell=True)  # runs "pip list" through the shell when the module is loaded; its output is not captured
 IS_COMPILE = True  # EndpointHandler.__init__ loads via load_pipeline_compile when true, load_pipeline_stable otherwise
 IS_TURBO = False  # when true, EndpointHandler uses the "flux-dev-8step" repo instead of "flux-dev"
-IS_4BIT = True  # not read in the visible part of this file; presumably selects a 4-bit quantized loader - TODO confirm
 #if IS_COMPILE:
 #    import torch._dynamo
@@ -92,12 +96,26 @@ def load_pipeline_turbo_compile(repo_id: str, dtype: torch.dtype) -> Any:
     pipe.to("cuda")
     return pipe
 class EndpointHandler:
     def __init__(self, path=""):
         repo_id = "NoMoreCopyrightOrg/flux-dev-8step" if IS_TURBO else "NoMoreCopyrightOrg/flux-dev"
         dtype = torch.bfloat16
         #dtype = torch.float16 # for older nVidia GPUs
-        if IS_COMPILE: self.pipeline = load_pipeline_compile(repo_id, dtype)
         else: self.pipeline = load_pipeline_stable(repo_id, dtype)
         gc.collect()
         torch.cuda.empty_cache()

 import os
 from typing import Any, Dict
+import gc
 from PIL import Image
 from huggingface_hub import hf_hub_download
+import torch
+from torchao.quantization import quantize_, autoquant, int8_dynamic_activation_int8_weight, int8_dynamic_activation_int4_weight, float8_dynamic_activation_float8_weight
+from torchao.quantization.quant_api import PerRow
+from diffusers import FluxPipeline, FluxTransformer2DModel, AutoencoderKL, TorchAoConfig
+# Set high precision for float32 matrix multiplications.
+# This setting optimizes performance on NVIDIA GPUs with Ampere architecture (e.g., A100, RTX 30 series) or newer.
+torch.set_float32_matmul_precision("high")
 import subprocess
 subprocess.run("pip list", shell=True)  # runs "pip list" through the shell when the module is loaded; its output is not captured
 IS_COMPILE = True  # EndpointHandler.__init__ loads via load_pipeline_opt when true, load_pipeline_stable otherwise
 IS_TURBO = False  # when true, EndpointHandler uses the "flux-dev-8step" repo instead of "flux-dev"
+IS_4BIT = False  # not read in the visible part of this file; presumably selects a 4-bit quantized loader - TODO confirm
 #if IS_COMPILE:
 #    import torch._dynamo
     pipe.to("cuda")
     return pipe
+def load_pipeline_opt(repo_id: str, dtype: torch.dtype) -> Any:  # build a FluxPipeline from repo_id whose transformer and VAE are float8-quantized and wrapped in torch.compile; returns the pipeline
+    transformer = FluxTransformer2DModel.from_pretrained(repo_id, subfolder="transformer", torch_dtype=dtype)  # load only the "transformer" subfolder, in the requested dtype
+    transformer.fuse_qkv_projections()  # fusion is applied before quantization, so the fused layers are what get quantized
+    quantize_(transformer, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")  # modifies the model in place (return value unused); float8 activations and weights with PerRow granularity
+    transformer.to(memory_format=torch.channels_last)  # switch to channels_last memory format before compiling
+    transformer = torch.compile(transformer, mode="max-autotune", fullgraph=True)  # rebind the name to the compiled wrapper; this is what the pipeline receives below
+    pipe = FluxPipeline.from_pretrained(repo_id, torch_dtype=dtype, transformer=transformer).to("cuda")  # load the remaining components from the same repo, reusing the prepared transformer, then move to "cuda"
+    pipe.vae.fuse_qkv_projections()  # same fuse -> quantize -> channels_last -> compile sequence, now for the VAE
+    quantize_(pipe.vae, float8_dynamic_activation_float8_weight(granularity=PerRow()), device="cuda")  # in-place float8 quantization of the VAE with the same config as the transformer
+    pipe.vae.to(memory_format=torch.channels_last)  # channels_last memory format for the VAE as well
+    pipe.vae = torch.compile(pipe.vae, mode="max-autotune", fullgraph=True)  # NOTE(review): this compiles the module object; confirm the pipeline's VAE calls (e.g. vae.decode) actually run through the compiled path
+    pipe.to("cuda")  # NOTE(review): the pipeline was already moved to "cuda" when it was created above; this second move looks redundant - confirm
+    return pipe  # caller (EndpointHandler.__init__) stores this as self.pipeline
 class EndpointHandler:
     def __init__(self, path=""):
         repo_id = "NoMoreCopyrightOrg/flux-dev-8step" if IS_TURBO else "NoMoreCopyrightOrg/flux-dev"
         dtype = torch.bfloat16
         #dtype = torch.float16 # for older nVidia GPUs
+        if IS_COMPILE: self.pipeline = load_pipeline_opt(repo_id, dtype)
         else: self.pipeline = load_pipeline_stable(repo_id, dtype)
         gc.collect()
         torch.cuda.empty_cache()