import torch
import diffusers
# Optional SDNQ support: the module must stay importable when the `sdnq`
# package is missing, so the imports are attempted and the outcome is
# recorded in SDNQ_AVAILABLE.
try:
    # NOTE(review): SDNQConfig is not referenced in the visible code;
    # presumably it is imported for an import-time registration side effect
    # -- confirm against the SDNQ documentation before removing it.
    from sdnq import SDNQConfig  # noqa: F401
    from sdnq.common import use_torch_compile as triton_is_available
    from sdnq.loader import apply_sdnq_options_to_model
except ImportError:
    print("SDNQ not found, optimized GLM loading will be skipped.")
    SDNQ_AVAILABLE = False
    # Bind inert fallbacks so these names always exist; code that forgets to
    # check SDNQ_AVAILABLE first then sees a falsy value, not a NameError.
    triton_is_available = False
    apply_sdnq_options_to_model = None
else:
    SDNQ_AVAILABLE = True
class GlmBackend:
    """Loader for a GLM image pipeline with optional SDNQ optimizations.

    A single ``diffusers.GlmImagePipeline`` is created by :meth:`load` and
    returned for both image generation and image editing.
    """

    def __init__(self, model_id="Disty0/GLM-Image-SDNQ-4bit-dynamic"):
        """Store the checkpoint identifier; nothing is loaded here.

        Args:
            model_id: Identifier handed to ``from_pretrained`` by ``load``.
        """
        self.model_id = model_id
        # Set by load(); stays None until load() has completed.
        self.pipeline = None

    @staticmethod
    def _accelerator_available():
        """Return True when torch reports an available CUDA or XPU device."""
        if torch.cuda.is_available():
            return True
        # Not every torch build exposes a ``torch.xpu`` namespace; look it up
        # defensively so a missing attribute means "no XPU" instead of an
        # AttributeError.
        xpu = getattr(torch, "xpu", None)
        return xpu is not None and bool(xpu.is_available())

    def load(self):
        """Load the pipeline, apply SDNQ options if possible, enable offload.

        Returns:
            A 2-tuple ``(pipeline, pipeline)``: the same pipeline object
            twice, because one pipeline serves both generation and editing.
        """
        print(f"Loading GLM backend from {self.model_id}...")

        # NOTE(security): trust_remote_code=True allows code shipped with the
        # model repository to run locally; only use a model_id you trust.
        # NOTE(review): when SDNQ is not installed the load is still
        # attempted; whether this checkpoint loads without it is not
        # verifiable from this file -- confirm.
        pipeline = diffusers.GlmImagePipeline.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )

        if SDNQ_AVAILABLE:
            # Quantized INT8 matmul is only switched on when both Triton and
            # a CUDA/XPU device are present.
            if triton_is_available and self._accelerator_available():
                print("Applying SDNQ optimizations (INT8 MatMul)...")
                pipeline.transformer = apply_sdnq_options_to_model(
                    pipeline.transformer, use_quantized_matmul=True
                )
                # Wrapping the transformer in torch.compile is an optional
                # further step that is intentionally left disabled here.
            else:
                print("Triton or CUDA/XPU not available, skipping SDNQ optimization.")

        # NOTE(review): offload is enabled unconditionally, including when no
        # accelerator was detected above -- confirm that is intended.
        print("Enabling CPU offload for GLM pipeline...")
        pipeline.enable_model_cpu_offload()

        self.pipeline = pipeline
        # The same pipeline handles generation and editing, so it is returned
        # in both positions.
        return self.pipeline, self.pipeline
|