catplusplus's picture
Upload folder using huggingface_hub
1e103b7 verified
import torch
import diffusers
# SDNQ is an optional dependency. Probe for it once at import time and record
# the outcome in SDNQ_AVAILABLE so later code can branch on the flag instead
# of re-attempting the import.
# NOTE(review): SDNQConfig is not referenced in the visible code; presumably it
# is imported for the side effects of importing `sdnq` -- confirm before
# removing it.
try:
    from sdnq import SDNQConfig
    from sdnq.common import use_torch_compile as triton_is_available
    from sdnq.loader import apply_sdnq_options_to_model
except ImportError:
    SDNQ_AVAILABLE = False
    print("SDNQ not found, optimized GLM loading will be skipped.")
else:
    SDNQ_AVAILABLE = True
class GlmBackend:
    """Loader for the GLM image pipeline with optional SDNQ optimizations.

    One ``diffusers.GlmImagePipeline`` instance is used for both image
    generation and image editing, so :meth:`load` returns the same object
    twice.
    """

    def __init__(self, model_id="Disty0/GLM-Image-SDNQ-4bit-dynamic"):
        """Store the model identifier; nothing is loaded until ``load()``.

        Args:
            model_id: Identifier handed to ``from_pretrained`` by ``load()``.
        """
        self.model_id = model_id
        self.pipeline = None

    @staticmethod
    def _accelerator_available():
        """Return True when torch reports a usable CUDA or XPU device.

        ``torch.xpu`` is looked up with ``getattr`` because accessing it
        directly raises ``AttributeError`` on torch builds that do not ship
        the attribute, which would abort loading on CUDA-less machines.
        """
        if torch.cuda.is_available():
            return True
        xpu = getattr(torch, "xpu", None)
        return xpu is not None and bool(xpu.is_available())

    def load(self):
        """Load the pipeline, apply SDNQ options if possible, enable offload.

        The loaded pipeline is also stored on ``self.pipeline``.

        Returns:
            A 2-tuple ``(pipeline, pipeline)``: the same pipeline object is
            returned for both the generation and the editing role.
        """
        print(f"Loading GLM backend from {self.model_id}...")

        # NOTE(review): trust_remote_code=True presumably lets code shipped
        # with the model repository run locally -- confirm the model source
        # is trusted.
        pipeline = diffusers.GlmImagePipeline.from_pretrained(
            self.model_id,
            torch_dtype=torch.bfloat16,
            trust_remote_code=True,
        )

        if SDNQ_AVAILABLE:
            # Enable INT8 MatMul only when Triton and a GPU backend are both
            # available.
            if triton_is_available and self._accelerator_available():
                print("Applying SDNQ optimizations (INT8 MatMul)...")
                pipeline.transformer = apply_sdnq_options_to_model(
                    pipeline.transformer, use_quantized_matmul=True
                )
                # Wrapping the transformer in torch.compile is an optional
                # extra step and is intentionally left disabled here.
            else:
                print("Triton or CUDA/XPU not available, skipping SDNQ optimization.")

        print("Enabling CPU offload for GLM pipeline...")
        pipeline.enable_model_cpu_offload()

        self.pipeline = pipeline

        # The same pipeline handles image generation and editing, so it is
        # returned for both roles.
        return self.pipeline, self.pipeline