catplusplus's picture
Upload folder using huggingface_hub
1e103b7 verified
import os
import torch
from diffusers import ZImagePipeline
from nunchaku.models.transformers.transformer_zimage import NunchakuZImageTransformer2DModel
from nunchaku.utils import get_gpu_memory
class ZImageTurboBackend:
def __init__(
self,
model_id,
optimized_model_path=None,
optimized_edit_model_path=None,
uma=False,
nvfp4_text_encoder_path: str | None = None,
):
self.model_id = model_id
self.optimized_model_path = optimized_model_path
self.pipeline = None
self.uma = uma
# Optional path to an NVFP4-pack-quantized Qwen3 text encoder. When set,
# we load the encoder via vLLM's CompressedTensorsW4A4Fp4 (CUTLASS NVFP4
# GEMM) instead of the bf16 text_encoder shipped inside the Z-Image
# base repo. Cuts encoder VRAM ~4x with negligible quality loss
# (cosine >0.999 vs the bf16 reference on Thor).
self.nvfp4_text_encoder_path = nvfp4_text_encoder_path
def _build_nvfp4_text_encoder(self):
"""Load the NVFP4 text encoder if requested, returns (encoder, tokenizer) or (None, None)."""
if not self.nvfp4_text_encoder_path:
return None, None
print(
f"[ZImageTurboBackend] Loading NVFP4 text encoder from {self.nvfp4_text_encoder_path} "
"(vLLM CompressedTensorsW4A4Fp4 + CUTLASS NVFP4 GEMM)"
)
from NVFP4TextEncoder import load_nvfp4_text_encoder
from transformers import AutoTokenizer
encoder = load_nvfp4_text_encoder(
self.nvfp4_text_encoder_path,
device="cuda",
dtype=torch.bfloat16,
)
tokenizer = AutoTokenizer.from_pretrained(self.nvfp4_text_encoder_path)
return encoder, tokenizer
def load(self):
print(f"Loading ZImageTurboBackend from {self.model_id}...")
print(f"Loading NunchakuZImageTransformer2DModel from {self.optimized_model_path}...")
# Load transformer (optimized model)
transformer = NunchakuZImageTransformer2DModel.from_pretrained(self.optimized_model_path)
# If requested, build the NVFP4 text encoder before constructing the pipeline so
# diffusers does not also load the bf16 text_encoder from disk (it would double VRAM).
nvfp4_encoder, nvfp4_tokenizer = self._build_nvfp4_text_encoder()
# Load pipeline
print("Initializing ZImagePipeline...")
pipeline_kwargs = dict(
transformer=transformer,
torch_dtype=torch.bfloat16,
low_cpu_mem_usage=False, # standard for HF example
)
if nvfp4_encoder is not None:
# Pass our pre-built encoder so diffusers skips loading the bf16 subfolder.
pipeline_kwargs["text_encoder"] = nvfp4_encoder
if nvfp4_tokenizer is not None:
pipeline_kwargs["tokenizer"] = nvfp4_tokenizer
pipeline = ZImagePipeline.from_pretrained(self.model_id, **pipeline_kwargs)
gpu_mem = get_gpu_memory()
print(f"GPU memory available: {gpu_mem} GB")
# Enable Flash Attention 2
try:
if hasattr(pipeline.transformer, "set_attention_backend"):
pipeline.transformer.set_attention_backend("native")
print("Enabled Native SDPA for Z-Image transformer")
if hasattr(pipeline.vae, "set_attention_backend"):
pipeline.vae.set_attention_backend("native")
print("Enabled Native SDPA for Z-Image VAE")
except Exception as e:
print(f"Could not enable Flash Attention 2: {e}")
if self.uma:
print("UMA mode enabled: Loading all components to GPU and disabling offloads")
# When using the NVFP4 encoder, it is already on CUDA and its quantised parameters
# are not compatible with diffusers' generic .to() pathway (e.g. uint8 weight_packed).
# We move only the diffusers-managed components (vae, transformer if not nunchaku, ...).
if nvfp4_encoder is not None:
# Exclude text_encoder from blanket .to('cuda'); it is already on cuda.
excl = getattr(pipeline, "_exclude_from_cpu_offload", [])
if "text_encoder" not in excl:
excl.append("text_encoder")
pipeline._exclude_from_cpu_offload = excl
for name, comp in pipeline.components.items():
if name == "text_encoder":
continue
if isinstance(comp, torch.nn.Module):
try:
comp.to("cuda")
except Exception:
pass
else:
pipeline.to("cuda")
elif gpu_mem <= 18:
print("GPU memory <= 18GB, using sequential cpu offload for low VRAM")
# The prompt requested sequential offloading without splitting layers for Nunchaku
pipeline._exclude_from_cpu_offload.append("transformer")
if nvfp4_encoder is not None:
# NVFP4 weights live entirely on CUDA; do not let accelerate move them.
pipeline._exclude_from_cpu_offload.append("text_encoder")
pipeline.enable_sequential_cpu_offload()
transformer.to("cuda")
if nvfp4_encoder is not None:
nvfp4_encoder.to("cuda")
else:
print("GPU memory > 18GB, using cpu offload")
if nvfp4_encoder is not None:
if not hasattr(pipeline, "_exclude_from_cpu_offload"):
pipeline._exclude_from_cpu_offload = []
pipeline._exclude_from_cpu_offload.append("text_encoder")
pipeline.enable_model_cpu_offload()
if nvfp4_encoder is not None:
nvfp4_encoder.to("cuda")
self.pipeline = pipeline
# Return twice for pipeline and edit_pipeline (though Z-Image-Turbo is T2I only)
return self.pipeline, self.pipeline