| import os |
| import torch |
| from diffusers import ZImagePipeline |
| from nunchaku.models.transformers.transformer_zimage import NunchakuZImageTransformer2DModel |
| from nunchaku.utils import get_gpu_memory |
|
|
|
|
| class ZImageTurboBackend: |
| def __init__( |
| self, |
| model_id, |
| optimized_model_path=None, |
| optimized_edit_model_path=None, |
| uma=False, |
| nvfp4_text_encoder_path: str | None = None, |
| ): |
| self.model_id = model_id |
| self.optimized_model_path = optimized_model_path |
| self.pipeline = None |
| self.uma = uma |
| |
| |
| |
| |
| |
| self.nvfp4_text_encoder_path = nvfp4_text_encoder_path |
|
|
| def _build_nvfp4_text_encoder(self): |
| """Load the NVFP4 text encoder if requested, returns (encoder, tokenizer) or (None, None).""" |
| if not self.nvfp4_text_encoder_path: |
| return None, None |
| print( |
| f"[ZImageTurboBackend] Loading NVFP4 text encoder from {self.nvfp4_text_encoder_path} " |
| "(vLLM CompressedTensorsW4A4Fp4 + CUTLASS NVFP4 GEMM)" |
| ) |
| from NVFP4TextEncoder import load_nvfp4_text_encoder |
| from transformers import AutoTokenizer |
|
|
| encoder = load_nvfp4_text_encoder( |
| self.nvfp4_text_encoder_path, |
| device="cuda", |
| dtype=torch.bfloat16, |
| ) |
| tokenizer = AutoTokenizer.from_pretrained(self.nvfp4_text_encoder_path) |
| return encoder, tokenizer |
|
|
| def load(self): |
| print(f"Loading ZImageTurboBackend from {self.model_id}...") |
| print(f"Loading NunchakuZImageTransformer2DModel from {self.optimized_model_path}...") |
|
|
| |
| transformer = NunchakuZImageTransformer2DModel.from_pretrained(self.optimized_model_path) |
|
|
| |
| |
| nvfp4_encoder, nvfp4_tokenizer = self._build_nvfp4_text_encoder() |
|
|
| |
| print("Initializing ZImagePipeline...") |
| pipeline_kwargs = dict( |
| transformer=transformer, |
| torch_dtype=torch.bfloat16, |
| low_cpu_mem_usage=False, |
| ) |
| if nvfp4_encoder is not None: |
| |
| pipeline_kwargs["text_encoder"] = nvfp4_encoder |
| if nvfp4_tokenizer is not None: |
| pipeline_kwargs["tokenizer"] = nvfp4_tokenizer |
|
|
| pipeline = ZImagePipeline.from_pretrained(self.model_id, **pipeline_kwargs) |
|
|
| gpu_mem = get_gpu_memory() |
| print(f"GPU memory available: {gpu_mem} GB") |
|
|
| |
| try: |
| if hasattr(pipeline.transformer, "set_attention_backend"): |
| pipeline.transformer.set_attention_backend("native") |
| print("Enabled Native SDPA for Z-Image transformer") |
| if hasattr(pipeline.vae, "set_attention_backend"): |
| pipeline.vae.set_attention_backend("native") |
| print("Enabled Native SDPA for Z-Image VAE") |
| except Exception as e: |
| print(f"Could not enable Flash Attention 2: {e}") |
|
|
| if self.uma: |
| print("UMA mode enabled: Loading all components to GPU and disabling offloads") |
| |
| |
| |
| if nvfp4_encoder is not None: |
| |
| excl = getattr(pipeline, "_exclude_from_cpu_offload", []) |
| if "text_encoder" not in excl: |
| excl.append("text_encoder") |
| pipeline._exclude_from_cpu_offload = excl |
| for name, comp in pipeline.components.items(): |
| if name == "text_encoder": |
| continue |
| if isinstance(comp, torch.nn.Module): |
| try: |
| comp.to("cuda") |
| except Exception: |
| pass |
| else: |
| pipeline.to("cuda") |
| elif gpu_mem <= 18: |
| print("GPU memory <= 18GB, using sequential cpu offload for low VRAM") |
| |
| pipeline._exclude_from_cpu_offload.append("transformer") |
| if nvfp4_encoder is not None: |
| |
| pipeline._exclude_from_cpu_offload.append("text_encoder") |
| pipeline.enable_sequential_cpu_offload() |
| transformer.to("cuda") |
| if nvfp4_encoder is not None: |
| nvfp4_encoder.to("cuda") |
| else: |
| print("GPU memory > 18GB, using cpu offload") |
| if nvfp4_encoder is not None: |
| if not hasattr(pipeline, "_exclude_from_cpu_offload"): |
| pipeline._exclude_from_cpu_offload = [] |
| pipeline._exclude_from_cpu_offload.append("text_encoder") |
| pipeline.enable_model_cpu_offload() |
| if nvfp4_encoder is not None: |
| nvfp4_encoder.to("cuda") |
|
|
| self.pipeline = pipeline |
| |
| return self.pipeline, self.pipeline |
|
|