import os

import torch
from diffusers import ZImagePipeline
from nunchaku.models.transformers.transformer_zimage import NunchakuZImageTransformer2DModel
from nunchaku.utils import get_gpu_memory


class ZImageTurboBackend:
    """Build a ``ZImagePipeline`` around a Nunchaku-optimized Z-Image transformer.

    ``load()`` assembles the pipeline, optionally swaps in an NVFP4-quantized
    text encoder, selects the native SDPA attention backend, and then picks a
    device-placement strategy from the ``uma`` flag and the reported GPU memory.
    """

    # At or below this value (as reported by ``get_gpu_memory()``; printed as
    # GB by ``load``) sequential CPU offload is used instead of model offload.
    LOW_VRAM_THRESHOLD_GB = 18

    def __init__(
        self,
        model_id,
        optimized_model_path=None,
        optimized_edit_model_path=None,
        uma=False,
        nvfp4_text_encoder_path: str | None = None,
    ):
        """Store configuration; nothing is loaded until ``load()`` is called.

        Args:
            model_id: Identifier passed to ``ZImagePipeline.from_pretrained``.
            optimized_model_path: Path passed to
                ``NunchakuZImageTransformer2DModel.from_pretrained``.
            optimized_edit_model_path: Stored but not used by ``load()``;
                Z-Image-Turbo is text-to-image only.
            uma: When true, every component is placed on the GPU and no CPU
                offload is enabled.
            nvfp4_text_encoder_path: Optional path to an NVFP4 text encoder
                that replaces the pipeline's own text encoder.
        """
        self.model_id = model_id
        self.optimized_model_path = optimized_model_path
        self.optimized_edit_model_path = optimized_edit_model_path
        self.pipeline = None
        self.uma = uma
        # Optional path to an NVFP4-pack-quantized Qwen3 text encoder. When set,
        # we load the encoder via vLLM's CompressedTensorsW4A4Fp4 (CUTLASS NVFP4
        # GEMM) instead of the bf16 text_encoder shipped inside the Z-Image
        # base repo. Cuts encoder VRAM ~4x with negligible quality loss
        # (cosine >0.999 vs the bf16 reference on Thor).
        self.nvfp4_text_encoder_path = nvfp4_text_encoder_path

    def _build_nvfp4_text_encoder(self):
        """Load the NVFP4 text encoder if one was configured.

        Returns:
            ``(encoder, tokenizer)`` when ``nvfp4_text_encoder_path`` is set,
            otherwise ``(None, None)``.
        """
        if not self.nvfp4_text_encoder_path:
            return None, None

        print(
            f"[ZImageTurboBackend] Loading NVFP4 text encoder from {self.nvfp4_text_encoder_path} "
            "(vLLM CompressedTensorsW4A4Fp4 + CUTLASS NVFP4 GEMM)"
        )
        # Imported lazily so the optional dependencies are only required when
        # the NVFP4 encoder is actually requested.
        from NVFP4TextEncoder import load_nvfp4_text_encoder
        from transformers import AutoTokenizer

        encoder = load_nvfp4_text_encoder(
            self.nvfp4_text_encoder_path,
            device="cuda",
            dtype=torch.bfloat16,
        )
        tokenizer = AutoTokenizer.from_pretrained(self.nvfp4_text_encoder_path)
        return encoder, tokenizer

    @staticmethod
    def _exclude_from_offload(pipeline, *names):
        """Add ``names`` to the pipeline's CPU-offload exclusion list.

        The list is copied and rebound on the instance rather than appended to
        in place: the attribute may be a list shared at class level, and an
        in-place append would leak the exclusion into other pipelines.
        """
        excluded = list(getattr(pipeline, "_exclude_from_cpu_offload", None) or [])
        for name in names:
            if name not in excluded:
                excluded.append(name)
        pipeline._exclude_from_cpu_offload = excluded

    @staticmethod
    def _enable_native_attention(pipeline):
        """Best-effort switch of the transformer and VAE to the native SDPA backend."""
        for attr, label in (("transformer", "transformer"), ("vae", "VAE")):
            # Each component is attempted independently so that a failure on
            # one does not prevent the other from being configured.
            try:
                component = getattr(pipeline, attr)
                if hasattr(component, "set_attention_backend"):
                    component.set_attention_backend("native")
                    print(f"Enabled Native SDPA for Z-Image {label}")
            except Exception as e:
                print(f"Could not enable native SDPA for Z-Image {label}: {e}")

    def _place_on_devices(self, pipeline, transformer, nvfp4_encoder, gpu_mem):
        """Choose and apply the device-placement / offload strategy.

        Args:
            pipeline: The freshly constructed ``ZImagePipeline``.
            transformer: The Nunchaku transformer passed into the pipeline.
            nvfp4_encoder: The NVFP4 text encoder, or ``None`` if not in use.
            gpu_mem: Value returned by ``get_gpu_memory()``.
        """
        uses_nvfp4 = nvfp4_encoder is not None

        if self.uma:
            print("UMA mode enabled: Loading all components to GPU and disabling offloads")
            if not uses_nvfp4:
                pipeline.to("cuda")
                return
            # When using the NVFP4 encoder, it is already on CUDA and its quantised parameters
            # are not compatible with diffusers' generic .to() pathway (e.g. uint8 weight_packed).
            # Move every other module individually and leave the text encoder alone.
            self._exclude_from_offload(pipeline, "text_encoder")
            for name, comp in pipeline.components.items():
                if name == "text_encoder" or not isinstance(comp, torch.nn.Module):
                    continue
                try:
                    comp.to("cuda")
                except Exception as e:
                    # Stay best-effort, but surface the failure instead of hiding it.
                    print(f"Warning: could not move '{name}' to cuda: {e}")
            return

        if gpu_mem <= self.LOW_VRAM_THRESHOLD_GB:
            print(
                f"GPU memory <= {self.LOW_VRAM_THRESHOLD_GB}GB, "
                "using sequential cpu offload for low VRAM"
            )
            # The Nunchaku transformer is excluded from sequential offload and
            # placed on the GPU explicitly after offload hooks are installed.
            self._exclude_from_offload(pipeline, "transformer")
            if uses_nvfp4:
                # NVFP4 weights live entirely on CUDA; do not let accelerate move them.
                self._exclude_from_offload(pipeline, "text_encoder")
            pipeline.enable_sequential_cpu_offload()
            transformer.to("cuda")
        else:
            print(f"GPU memory > {self.LOW_VRAM_THRESHOLD_GB}GB, using cpu offload")
            if uses_nvfp4:
                self._exclude_from_offload(pipeline, "text_encoder")
            pipeline.enable_model_cpu_offload()

        if uses_nvfp4:
            nvfp4_encoder.to("cuda")

    def load(self):
        """Build the pipeline and return it twice.

        Returns:
            ``(pipeline, pipeline)`` - the same object for both the pipeline
            and edit-pipeline slots, since Z-Image-Turbo is T2I only.
        """
        print(f"Loading ZImageTurboBackend from {self.model_id}...")
        print(f"Loading NunchakuZImageTransformer2DModel from {self.optimized_model_path}...")

        # Load transformer (optimized model)
        transformer = NunchakuZImageTransformer2DModel.from_pretrained(self.optimized_model_path)

        # If requested, build the NVFP4 text encoder before constructing the pipeline so
        # diffusers does not also load the bf16 text_encoder from disk (it would double VRAM).
        nvfp4_encoder, nvfp4_tokenizer = self._build_nvfp4_text_encoder()

        # Load pipeline
        print("Initializing ZImagePipeline...")
        pipeline_kwargs = {
            "transformer": transformer,
            "torch_dtype": torch.bfloat16,
            "low_cpu_mem_usage": False,  # standard for HF example
        }
        if nvfp4_encoder is not None:
            # Pass our pre-built encoder so diffusers skips loading the bf16 subfolder.
            pipeline_kwargs["text_encoder"] = nvfp4_encoder
            if nvfp4_tokenizer is not None:
                pipeline_kwargs["tokenizer"] = nvfp4_tokenizer

        pipeline = ZImagePipeline.from_pretrained(self.model_id, **pipeline_kwargs)

        gpu_mem = get_gpu_memory()
        print(f"GPU memory available: {gpu_mem} GB")

        self._enable_native_attention(pipeline)
        self._place_on_devices(pipeline, transformer, nvfp4_encoder, gpu_mem)

        self.pipeline = pipeline

        # Return twice for pipeline and edit_pipeline (though Z-Image-Turbo is T2I only)
        return self.pipeline, self.pipeline