"""ModelPatcher - Handles LoRA and weight patching for models."""
import copy
import logging
import uuid
import torch
from src.Device import Device
from src.NeuralNetwork import unet
from src.Utilities import util
try:
import tomesd
import tomesd.patch
TOMESD_AVAILABLE = True
    # Monkey-patch tomesd so its ToMe blocks tolerate the extra
    # transformer_options argument this codebase passes to _forward.
    _original_make_tome_block = tomesd.patch.make_tome_block

    def _fixed_make_tome_block(block_class):
        cls = _original_make_tome_block(block_class)
        old_forward = cls._forward

        def new_forward(self, x, context=None, *args, **kwargs):
            # tomesd's _forward only accepts (x, context); drop anything extra.
            return old_forward(self, x, context)

        cls._forward = new_forward
        return cls
tomesd.patch.make_tome_block = _fixed_make_tome_block
except ImportError:
TOMESD_AVAILABLE = False
tomesd = None
def wipe_lowvram_weight(m):
    """Undo low-VRAM casting state on a module and clear its deferred patch hooks."""
    if hasattr(m, "prev_comfy_cast_weights"):
        m.comfy_cast_weights = m.prev_comfy_cast_weights
        del m.prev_comfy_cast_weights
    m.weight_function = m.bias_function = None
class LowVramPatch:
    """Deferred LoRA application: assigned as a module's weight_function so the
    patch is recomputed each time the weight is cast, instead of being baked
    into a weight that may not fit in VRAM."""
    def __init__(self, key: str, model_patcher: "ModelPatcher"):
        self.key, self.model_patcher = key, model_patcher
    def __call__(self, weight: torch.Tensor) -> torch.Tensor:
        return self.model_patcher.calculate_weight(self.model_patcher.patches[self.key], weight, self.key)
class ModelFunctionWrapperChain:
"""Compose multiple model_function_wrapper hooks without overwriting them.
Several optimizations patch the same U-Net wrapper hook. Keeping only the
last wrapper silently disables earlier optimizations. This chain preserves
application order by making the most recently-added wrapper the outermost
wrapper around the existing stack.
"""
def __init__(self, wrappers=None):
self.wrappers = list(wrappers or [])
def add_outer(self, wrapper):
self.wrappers.insert(0, wrapper)
return self
def __call__(self, model_function, params):
return self._invoke(0, model_function, params)
def _invoke(self, index, model_function, params):
if index >= len(self.wrappers):
return model_function(
params["input"],
params["timestep"],
**params.get("c", {}),
)
wrapper = self.wrappers[index]
def next_model_function(input_x, timestep, **c_kwargs):
next_params = dict(params)
next_params["input"] = input_x
next_params["timestep"] = timestep
next_params["c"] = c_kwargs
return self._invoke(index + 1, model_function, next_params)
return wrapper(next_model_function, params)
def to(self, device):
updated = []
for wrapper in self.wrappers:
if hasattr(wrapper, "to"):
moved = wrapper.to(device)
updated.append(moved if moved is not None else wrapper)
else:
updated.append(wrapper)
self.wrappers = updated
return self
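# Illustrative sketch (define-only, never called in this module): two wrappers
# composed through ModelFunctionWrapperChain run outermost-first, so a wrapper
# added last via add_outer sees the call before the one it wraps.
def _example_wrapper_chain():
    def make_wrapper(tag, log):
        def wrapper(model_function, params):
            log.append(tag)
            return model_function(params["input"], params["timestep"], **params.get("c", {}))
        return wrapper

    log = []
    chain = ModelFunctionWrapperChain([make_wrapper("inner", log)])
    chain.add_outer(make_wrapper("outer", log))
    # Stand-in for the real apply_model function of the U-Net.
    result = chain(lambda x, t, **c: x + t, {"input": 1.0, "timestep": 2.0, "c": {}})
    assert log == ["outer", "inner"] and result == 3.0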
class ModelPatcher:
def __init__(self, model: torch.nn.Module, load_device: torch.device, offload_device: torch.device,
size: int = 0, current_device: torch.device = None, weight_inplace_update: bool = False):
self.size, self.model, self.patches, self.backup = size, model, {}, {}
self.object_patches, self.object_patches_backup = {}, {}
self.model_options = {"transformer_options": {}}
self.model_size()
self.load_device, self.offload_device = load_device, offload_device
self.current_device = current_device or self.offload_device
self.weight_inplace_update, self.model_lowvram, self.lowvram_patch_counter = weight_inplace_update, False, 0
self.patches_uuid = uuid.uuid4()
self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {}
for attr, default in [("model_loaded_weight_memory", 0), ("model_lowvram", False), ("lowvram_patch_counter", 0)]:
if not hasattr(self.model, attr):
setattr(self.model, attr, default)
    def named_modules(self):
        if hasattr(self.model, "diffusion_model"):
            yield from self.model.diffusion_model.named_modules()
def loaded_size(self):
return self.model.model_loaded_weight_memory
def model_size(self) -> int:
if self.size > 0:
return self.size
self.size = Device.module_size(self.model)
self.model_keys = set(self.model.state_dict().keys())
return self.size
def clone(self) -> "ModelPatcher":
n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, self.weight_inplace_update)
n.patches = {k: v[:] for k, v in self.patches.items()}
n.patches_uuid, n.object_patches = self.patches_uuid, self.object_patches.copy()
n.model_options, n.model_keys, n.backup = copy.deepcopy(self.model_options), self.model_keys, self.backup
n.object_patches_backup = self.object_patches_backup
return n
def is_clone(self, other):
return hasattr(other, "model") and self.model is other.model
def memory_required(self, input_shape):
return self.model.memory_required(input_shape=input_shape)
def set_model_unet_function_wrapper(self, f):
existing = self.model_options.get("model_function_wrapper")
if existing is None:
self.model_options["model_function_wrapper"] = f
return
if isinstance(existing, ModelFunctionWrapperChain):
existing.add_outer(f)
self.model_options["model_function_wrapper"] = existing
return
self.model_options["model_function_wrapper"] = ModelFunctionWrapperChain([f, existing])
def set_model_denoise_mask_function(self, f):
self.model_options["denoise_mask_function"] = f
def get_model_object(self, name):
return util.get_attr(self.model, name)
def model_patches_to(self, device):
wrap_func = self.model_options.get("model_function_wrapper")
if wrap_func and hasattr(wrap_func, "to"):
self.model_options["model_function_wrapper"] = wrap_func.to(device)
def model_dtype(self):
return self.model.get_dtype() if hasattr(self.model, "get_dtype") else None
def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
p = set()
for k in patches:
if k in self.model_keys:
p.add(k)
self.patches[k] = self.patches.get(k, []) + [(strength_patch, patches[k], strength_model)]
self.patches_uuid = uuid.uuid4()
return list(p)
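    # Usage sketch (hypothetical key name): values handed to add_patches are
    # ("lora", (mat_up, mat_down, alpha)) pairs, and each stored entry becomes
    # the (strength_patch, value, strength_model) triple that calculate_weight
    # unpacks below:
    #
    #     up, down = torch.zeros(320, 4), torch.zeros(4, 768)
    #     lora = {"diffusion_model.some.key.weight": ("lora", (up, down, None))}
    #     patched_keys = patcher.add_patches(lora, strength_patch=0.8)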
    def set_model_patch(self, patch, name):
        to = self.model_options["transformer_options"]
        patches = to.setdefault("patches", {})
        patches[name] = patches.get(name, []) + [patch]
def set_model_attn1_patch(self, patch):
self.set_model_patch(patch, "attn1_patch")
def set_model_attn2_patch(self, patch):
self.set_model_patch(patch, "attn2_patch")
def set_model_attn1_output_patch(self, patch):
self.set_model_patch(patch, "attn1_output_patch")
def set_model_attn2_output_patch(self, patch):
self.set_model_patch(patch, "attn2_output_patch")
    def model_state_dict(self, filter_prefix=None):
        # filter_prefix is accepted for interface compatibility but not applied.
        return self.model.state_dict()
def patch_weight_to_device(self, key, device_to=None):
if key not in self.patches:
return
weight = util.get_attr(self.model, key)
if key not in self.backup:
self.backup[key] = weight.to(device=self.offload_device, copy=self.weight_inplace_update)
temp_weight = Device.cast_to_device(weight, device_to, torch.float32, copy=True) if device_to else weight.to(torch.float32, copy=True)
out_weight = self.calculate_weight(self.patches[key], temp_weight, key).to(weight.dtype)
(util.copy_to_param if self.weight_inplace_update else util.set_attr_param)(self.model, key, out_weight)
def weight_only_quantize(self, dtype: torch.dtype | str = torch.float8_e4m3fn):
"""Quantize all model weights to the target dtype or format (weight-only)."""
if isinstance(dtype, str):
format_name = dtype.lower()
else:
format_name = str(dtype)
logging.info(f"Quantizing model weights to {format_name}")
with torch.no_grad():
for n, m in self.model.named_modules():
if hasattr(m, "weight") and m.weight is not None:
# Don't quantize small tensors or non-float weights
if m.weight.numel() > 4096 and m.weight.is_floating_point() and m.weight.ndim == 2:
if format_name == "nvfp4":
from src.Utilities.Quantization import quantize_nvfp4, from_blocked
orig_shape = m.weight.shape
q_weight, tensor_scale, blocked_scales = quantize_nvfp4(m.weight)
m.weight = torch.nn.Parameter(q_weight, requires_grad=False)
m.quant_format = "nvfp4"
# Register as buffers so they move with the model
# (They are automatically handled by CastWeightBiasOp via getattr)
m.register_buffer("weight_scale_2", tensor_scale)
# Pre-de-block scales to save compute during inference
rows, cols = orig_shape
block_cols = (cols + 15) // 16
deblocked_scales = from_blocked(blocked_scales, rows, block_cols)
m.register_buffer("weight_scale", deblocked_scales)
m.original_shape = orig_shape
m.comfy_cast_weights = True
else:
q_weight = m.weight.to(dtype)
# We keep it as a Parameter so it can be used in forward
m.weight = torch.nn.Parameter(q_weight, requires_grad=False)
# Enable weight casting so it dequantizes to input dtype on the fly
if hasattr(m, "comfy_cast_weights"):
m.comfy_cast_weights = True
if hasattr(m, "bias") and m.bias is not None:
# Biases are usually kept in higher precision
pass
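    # Usage sketch: quantize eligible 2-D float weights in place, either via a
    # plain dtype cast or the blocked "nvfp4" path above:
    #
    #     patcher.weight_only_quantize(torch.float8_e4m3fn)  # dtype cast path
    #     patcher.weight_only_quantize("nvfp4")              # 4-bit blocked path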
    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
        """Move modules to device_to, patching weights as they go; modules past
        the lowvram_model_memory budget keep low-VRAM casting enabled instead."""
        mem_counter, patch_counter, lowvram_counter = 0, 0, 0
loading = sorted([(Device.module_size(m), n, m) for n, m in self.model.named_modules()
if hasattr(m, "comfy_cast_weights") or hasattr(m, "weight")], reverse=True)
load_completely = []
for module_mem, n, m in loading:
lowvram_weight = not full_load and hasattr(m, "comfy_cast_weights") and mem_counter + module_mem >= lowvram_model_memory
if lowvram_weight:
lowvram_counter += 1
if hasattr(m, "prev_comfy_cast_weights"):
continue
if force_patch_weights:
for pkey in [f"{n}.weight", f"{n}.bias"]:
if pkey in self.patches:
self.patch_weight_to_device(pkey)
m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True
else:
if hasattr(m, "comfy_cast_weights") and m.comfy_cast_weights:
wipe_lowvram_weight(m)
if hasattr(m, "weight"):
mem_counter += module_mem
load_completely.append((module_mem, n, m))
for _, n, m in sorted(load_completely, reverse=True):
if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights:
continue
self.patch_weight_to_device(f"{n}.weight", device_to)
self.patch_weight_to_device(f"{n}.bias", device_to)
m.comfy_patched_weights = True
for _, _, m in load_completely:
m.to(device_to)
        if lowvram_counter > 0:
            logging.info(f"loaded partially: budget {lowvram_model_memory / 1e6:.1f}MB, loaded {mem_counter / 1e6:.1f}MB, {patch_counter} low-vram patches")
            self.model.model_lowvram = True
        else:
            logging.info(f"loaded completely: budget {lowvram_model_memory / 1e6:.1f}MB, loaded {mem_counter / 1e6:.1f}MB, full_load={full_load}")
            self.model.model_lowvram = False
if full_load:
self.model.to(device_to)
mem_counter = self.model_size()
self.model.lowvram_patch_counter += patch_counter
self.model.device, self.model.model_loaded_weight_memory = device_to, mem_counter
def _apply_object_patches(self):
for k in self.object_patches:
old = util.set_attr(self.model, k, self.object_patches[k])
if k not in self.object_patches_backup:
self.object_patches_backup[k] = old
def patch_model_flux(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
self._apply_object_patches()
if load_weights:
self.load(device_to, lowvram_model_memory, force_patch_weights, full_load=lowvram_model_memory == 0)
return self.model
def patch_model_lowvram_flux(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights)
def patch_model(self, device_to=None, patch_weights=True):
self._apply_object_patches()
if patch_weights:
for key in self.patches:
if key not in self.model.state_dict():
logging.warning(f"could not patch. key doesn't exist in model: {key}")
continue
self.patch_weight_to_device(key, device_to)
if device_to:
self.model.to(device_to)
self.current_device = device_to
return self.model
def patch_model_lowvram(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights)
def _patch_model_lowvram_impl(self, device_to, lowvram_model_memory, force_patch_weights):
self.patch_model(device_to, patch_weights=False)
logging.info(f"loading in lowvram mode {lowvram_model_memory / 1e6:.1f}")
mem_counter, patch_counter = 0, 0
for n, m in self.model.named_modules():
lowvram_weight = hasattr(m, "comfy_cast_weights") and mem_counter + Device.module_size(m) >= lowvram_model_memory
if lowvram_weight:
for pkey in [f"{n}.weight", f"{n}.bias"]:
if pkey in self.patches:
if force_patch_weights:
self.patch_weight_to_device(pkey)
else:
                            attr = "weight_function" if pkey.endswith(".weight") else "bias_function"
                            setattr(m, attr, LowVramPatch(pkey, self))
patch_counter += 1
m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True
elif hasattr(m, "weight"):
self.patch_weight_to_device(f"{n}.weight", device_to)
self.patch_weight_to_device(f"{n}.bias", device_to)
m.to(device_to)
mem_counter += Device.module_size(m)
self.model_lowvram, self.lowvram_patch_counter = True, patch_counter
return self.model
    def calculate_weight(self, patches, weight, key):
        """Apply the accumulated low-rank patches for `key` to `weight` in place."""
for p in patches:
alpha, v = p[0], p[1][1]
mat1 = Device.cast_to_device(v[0], weight.device, torch.float32)
mat2 = Device.cast_to_device(v[1], weight.device, torch.float32)
if v[2] is not None:
alpha *= v[2] / mat2.shape[0]
patch_shape = (mat1.shape[0], mat2.shape[1])
if patch_shape != weight.shape:
# Handle cases where weight might be flattened but patch is not, or vice versa
if mat1.flatten(start_dim=1).shape[0] * mat2.flatten(start_dim=1).shape[1] != weight.numel():
logging.warning(f"Skipping patch for {key}: shape mismatch. Weight: {weight.shape}, Patch: {patch_shape}")
continue
try:
weight += (alpha * torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1))).reshape(weight.shape).type(weight.dtype)
except Exception as e:
logging.error(f"Failed to apply patch for {key}: {e}")
continue
return weight
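    # The loop above applies the standard low-rank (LoRA-style) update
    #     W' = W + s * (alpha / rank) * (mat1 @ mat2)
    # with s the patch strength, rank = mat2.shape[0], and alpha = v[2] (when
    # v[2] is None the delta is just s * mat1 @ mat2). E.g. with all-ones mat1
    # (4, 2) and mat2 (2, 6), alpha = 2.0 and s = 0.5: each product entry is
    # 2.0, so each delta entry is 0.5 * (2.0 / 2) * 2.0 = 1.0.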
def unpatch_model(self, device_to=None, unpatch_weights=True):
if unpatch_weights:
for k in list(self.backup.keys()):
util.set_attr_param(self.model, k, self.backup[k])
self.backup.clear()
if device_to:
self.model.to(device_to)
self.current_device = device_to
        for k in self.object_patches_backup:
            util.set_attr(self.model, k, self.object_patches_backup[k])
        self.object_patches_backup.clear()
    def partially_load(self, device_to, extra_memory=0):
        """Grow a partially-loaded model by extra_memory bytes; returns bytes actually added."""
self.unpatch_model(unpatch_weights=False)
self.patch_model(patch_weights=False)
if not self.model.model_lowvram:
return 0
full_load = self.model.model_loaded_weight_memory + extra_memory > self.model_size()
current_used = self.model.model_loaded_weight_memory
self.load(device_to, lowvram_model_memory=current_used + extra_memory, full_load=full_load)
return self.model.model_loaded_weight_memory - current_used
def add_object_patch(self, name, obj):
self.object_patches[name] = obj
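    # Sketch (hypothetical attribute path): swap a submodule without touching
    # its weights; the original object is backed up when patches are applied
    # and restored by unpatch_model:
    #
    #     patcher.add_object_patch("diffusion_model.middle_block.1", fast_attention)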
def apply_tome(self, ratio=0.5, max_downsample=1):
if not TOMESD_AVAILABLE:
logging.warning("Token Merging (tomesd) not available")
return False
        try:
            tomesd.remove_patch(self)
        except Exception:
            # No earlier ToMe patch to remove.
            pass
self.tome_enabled, self.tome_ratio = False, 0.5
try:
if hasattr(self.model, 'diffusion_model'):
tomesd.apply_patch(self, ratio=ratio, max_downsample=max_downsample)
self.tome_enabled, self.tome_ratio = True, ratio
logging.info(f"Applied Token Merging with ratio={ratio}, max_downsample={max_downsample}")
return True
return False
except Exception as e:
logging.error(f"Failed to apply Token Merging: {e}")
return False
def remove_tome(self):
if not TOMESD_AVAILABLE or not self.tome_enabled:
return False
try:
tomesd.remove_patch(self)
self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {}
return True
except Exception as e:
logging.error(f"Failed to remove Token Merging: {e}")
return False
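# Usage sketch (hypothetical; `patcher` is a constructed ModelPatcher): enable
# Token Merging on the highest-resolution blocks only, then undo it before
# switching models.
def _example_tome(patcher: "ModelPatcher"):
    if patcher.apply_tome(ratio=0.5, max_downsample=1):
        ...  # run sampling with merged tokens here
        patcher.remove_tome()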
def unet_prefix_from_state_dict(state_dict):
    """Guess the key prefix of the diffusion model inside a checkpoint state dict."""
counts = {c: sum(1 for k in state_dict if k.startswith(c)) for c in ["model.diffusion_model.", "model.model."]}
top = max(counts, key=counts.get)
return top if counts[top] > 5 else "model."
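# Sketch: prefix detection on a checkpoint-style dict. More than five keys
# under "model.diffusion_model." selects that prefix; otherwise "model." is
# the fallback.
def _example_unet_prefix():
    sd = {f"model.diffusion_model.block{i}.weight": None for i in range(6)}
    assert unet_prefix_from_state_dict(sd) == "model.diffusion_model."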
def load_diffusion_model_state_dict(sd, model_options=None):
    """Build a ModelPatcher from a raw diffusion-model state dict."""
    model_options = model_options or {}
    dtype = model_options.get("dtype")
prefix = unet_prefix_from_state_dict(sd)
temp_sd = util.state_dict_prefix_replace(sd, {prefix: ""}, filter_keys=True)
if len(temp_sd) > 0:
sd = temp_sd
parameters, load_device = util.calculate_parameters(sd), Device.get_torch_device()
model_config = unet.model_config_from_unet(sd, "")
offload_device = Device.unet_offload_device()
unet_dtype2 = dtype or Device.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
manual_cast_dtype = Device.unet_manual_cast(unet_dtype2, load_device, model_config.supported_inference_dtypes)
model_config.set_inference_dtype(unet_dtype2, manual_cast_dtype)
model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
model = model_config.get_model(sd, "").to(offload_device)
model.load_model_weights(sd, "")
return ModelPatcher(model, load_device=load_device, offload_device=offload_device)
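# End-to-end sketch (hypothetical checkpoint path; assumes safetensors is
# installed): load a raw UNet state dict and wrap it in a ModelPatcher.
def _example_load_unet():
    from safetensors.torch import load_file
    sd = load_file("checkpoints/unet.safetensors")  # hypothetical path
    return load_diffusion_model_state_dict(sd, model_options={"dtype": torch.float16})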