"""Backend wrapper that loads the GLM image pipeline, optionally with SDNQ."""

import torch
import diffusers

# SDNQ is an optional dependency: when it is missing the pipeline is still
# loaded, only the quantized-matmul optimization step is skipped.
try:
    from sdnq import SDNQConfig
    from sdnq.common import use_torch_compile as triton_is_available
    from sdnq.loader import apply_sdnq_options_to_model
    SDNQ_AVAILABLE = True
except ImportError:
    print("SDNQ not found, optimized GLM loading will be skipped.")
    SDNQ_AVAILABLE = False


class GlmBackend:
    """Loads a ``diffusers.GlmImagePipeline`` and keeps a reference to it.

    Attributes:
        model_id: Identifier passed to ``from_pretrained``.
        pipeline: The loaded pipeline, or ``None`` until ``load()`` is called.
    """

    def __init__(self, model_id: str = "Disty0/GLM-Image-SDNQ-4bit-dynamic") -> None:
        """Store the model identifier; nothing is loaded until ``load()``."""
        self.model_id = model_id
        self.pipeline = None

    @staticmethod
    def _accelerator_available() -> bool:
        """Return True when a CUDA or XPU device is reported as available.

        ``torch.xpu`` is looked up with ``getattr`` because not every PyTorch
        build exposes that submodule; a direct attribute access would raise
        ``AttributeError`` on such builds whenever CUDA is unavailable.
        """
        if torch.cuda.is_available():
            return True
        xpu = getattr(torch, "xpu", None)
        return xpu is not None and bool(xpu.is_available())

    def load(self) -> tuple:
        """Load the pipeline, apply optional SDNQ options, enable CPU offload.

        Returns:
            A 2-tuple containing the same pipeline object twice: one slot for
            image generation and one for image editing, since this model uses
            a single pipeline for both.
        """
        print(f"Loading GLM backend from {self.model_id}...")

        # Load the pipeline in bfloat16.
        pipeline = diffusers.GlmImagePipeline.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )

        if SDNQ_AVAILABLE:
            # Enable INT8 MatMul for GPUs if Triton is available.
            if triton_is_available and self._accelerator_available():
                print("Applying SDNQ optimizations (INT8 MatMul)...")
                pipeline.transformer = apply_sdnq_options_to_model(
                    pipeline.transformer,
                    use_quantized_matmul=True,
                )
                # NOTE: wrapping pipeline.transformer in torch.compile is an
                # optional further step and is intentionally left disabled.
            else:
                print("Triton or CUDA/XPU not available, skipping SDNQ optimization.")

        print("Enabling CPU offload for GLM pipeline...")
        pipeline.enable_model_cpu_offload()

        self.pipeline = pipeline

        # The same pipeline serves both image generation and image editing,
        # so it is returned for both slots.
        return self.pipeline, self.pipeline