"""ModelPatcher - Handles LoRA and weight patching for models.""" import copy import logging import uuid import torch from src.Device import Device from src.NeuralNetwork import unet from src.Utilities import util try: import tomesd import tomesd.patch TOMESD_AVAILABLE = True # Monkey-patch tomesd to support our transformer_options argument in _forward _original_make_tome_block = tomesd.patch.make_tome_block def _fixed_make_tome_block(block_class): cls = _original_make_tome_block(block_class) old_forward = cls._forward def new_forward(self, x, context=None, *args, **kwargs): return old_forward(self, x, context) cls._forward = new_forward return cls tomesd.patch.make_tome_block = _fixed_make_tome_block except ImportError: TOMESD_AVAILABLE = False tomesd = None def wipe_lowvram_weight(m): if hasattr(m, "prev_comfy_cast_weights"): m.comfy_cast_weights = m.prev_comfy_cast_weights del m.prev_comfy_cast_weights m.weight_function = m.bias_function = None class LowVramPatch: def __init__(self, key: str, model_patcher: "ModelPatcher"): self.key, self.model_patcher = key, model_patcher def __call__(self, weight: torch.Tensor) -> torch.Tensor: return self.model_patcher.calculate_weight(self.model_patcher.patches[self.key], weight, self.key) class ModelFunctionWrapperChain: """Compose multiple model_function_wrapper hooks without overwriting them. Several optimizations patch the same U-Net wrapper hook. Keeping only the last wrapper silently disables earlier optimizations. This chain preserves application order by making the most recently-added wrapper the outermost wrapper around the existing stack. """ def __init__(self, wrappers=None): self.wrappers = list(wrappers or []) def add_outer(self, wrapper): self.wrappers.insert(0, wrapper) return self def __call__(self, model_function, params): return self._invoke(0, model_function, params) def _invoke(self, index, model_function, params): if index >= len(self.wrappers): return model_function( params["input"], params["timestep"], **params.get("c", {}), ) wrapper = self.wrappers[index] def next_model_function(input_x, timestep, **c_kwargs): next_params = dict(params) next_params["input"] = input_x next_params["timestep"] = timestep next_params["c"] = c_kwargs return self._invoke(index + 1, model_function, next_params) return wrapper(next_model_function, params) def to(self, device): updated = [] for wrapper in self.wrappers: if hasattr(wrapper, "to"): moved = wrapper.to(device) updated.append(moved if moved is not None else wrapper) else: updated.append(wrapper) self.wrappers = updated return self class ModelPatcher: def __init__(self, model: torch.nn.Module, load_device: torch.device, offload_device: torch.device, size: int = 0, current_device: torch.device = None, weight_inplace_update: bool = False): self.size, self.model, self.patches, self.backup = size, model, {}, {} self.object_patches, self.object_patches_backup = {}, {} self.model_options = {"transformer_options": {}} self.model_size() self.load_device, self.offload_device = load_device, offload_device self.current_device = current_device or self.offload_device self.weight_inplace_update, self.model_lowvram, self.lowvram_patch_counter = weight_inplace_update, False, 0 self.patches_uuid = uuid.uuid4() self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {} for attr, default in [("model_loaded_weight_memory", 0), ("model_lowvram", False), ("lowvram_patch_counter", 0)]: if not hasattr(self.model, attr): setattr(self.model, attr, default) def named_modules(self): yield from self.model.diffusion_model.named_modules() if hasattr(self.model, 'diffusion_model') else iter([]) def loaded_size(self): return self.model.model_loaded_weight_memory def model_size(self) -> int: if self.size > 0: return self.size self.size = Device.module_size(self.model) self.model_keys = set(self.model.state_dict().keys()) return self.size def clone(self) -> "ModelPatcher": n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, self.weight_inplace_update) n.patches = {k: v[:] for k, v in self.patches.items()} n.patches_uuid, n.object_patches = self.patches_uuid, self.object_patches.copy() n.model_options, n.model_keys, n.backup = copy.deepcopy(self.model_options), self.model_keys, self.backup n.object_patches_backup = self.object_patches_backup return n def is_clone(self, other): return hasattr(other, "model") and self.model is other.model def memory_required(self, input_shape): return self.model.memory_required(input_shape=input_shape) def set_model_unet_function_wrapper(self, f): existing = self.model_options.get("model_function_wrapper") if existing is None: self.model_options["model_function_wrapper"] = f return if isinstance(existing, ModelFunctionWrapperChain): existing.add_outer(f) self.model_options["model_function_wrapper"] = existing return self.model_options["model_function_wrapper"] = ModelFunctionWrapperChain([f, existing]) def set_model_denoise_mask_function(self, f): self.model_options["denoise_mask_function"] = f def get_model_object(self, name): return util.get_attr(self.model, name) def model_patches_to(self, device): wrap_func = self.model_options.get("model_function_wrapper") if wrap_func and hasattr(wrap_func, "to"): self.model_options["model_function_wrapper"] = wrap_func.to(device) def model_dtype(self): return self.model.get_dtype() if hasattr(self.model, "get_dtype") else None def add_patches(self, patches, strength_patch=1.0, strength_model=1.0): p = set() for k in patches: if k in self.model_keys: p.add(k) self.patches[k] = self.patches.get(k, []) + [(strength_patch, patches[k], strength_model)] self.patches_uuid = uuid.uuid4() return list(p) def set_model_patch(self, patch, name): to = self.model_options["transformer_options"] to.setdefault("patches", {})[name] = to.get("patches", {}).get(name, []) + [patch] def set_model_attn1_patch(self, patch): self.set_model_patch(patch, "attn1_patch") def set_model_attn2_patch(self, patch): self.set_model_patch(patch, "attn2_patch") def set_model_attn1_output_patch(self, patch): self.set_model_patch(patch, "attn1_output_patch") def set_model_attn2_output_patch(self, patch): self.set_model_patch(patch, "attn2_output_patch") def model_state_dict(self, filter_prefix=None): return self.model.state_dict() def patch_weight_to_device(self, key, device_to=None): if key not in self.patches: return weight = util.get_attr(self.model, key) if key not in self.backup: self.backup[key] = weight.to(device=self.offload_device, copy=self.weight_inplace_update) temp_weight = Device.cast_to_device(weight, device_to, torch.float32, copy=True) if device_to else weight.to(torch.float32, copy=True) out_weight = self.calculate_weight(self.patches[key], temp_weight, key).to(weight.dtype) (util.copy_to_param if self.weight_inplace_update else util.set_attr_param)(self.model, key, out_weight) def weight_only_quantize(self, dtype: torch.dtype | str = torch.float8_e4m3fn): """Quantize all model weights to the target dtype or format (weight-only).""" if isinstance(dtype, str): format_name = dtype.lower() else: format_name = str(dtype) logging.info(f"Quantizing model weights to {format_name}") with torch.no_grad(): for n, m in self.model.named_modules(): if hasattr(m, "weight") and m.weight is not None: # Don't quantize small tensors or non-float weights if m.weight.numel() > 4096 and m.weight.is_floating_point() and m.weight.ndim == 2: if format_name == "nvfp4": from src.Utilities.Quantization import quantize_nvfp4, from_blocked orig_shape = m.weight.shape q_weight, tensor_scale, blocked_scales = quantize_nvfp4(m.weight) m.weight = torch.nn.Parameter(q_weight, requires_grad=False) m.quant_format = "nvfp4" # Register as buffers so they move with the model # (They are automatically handled by CastWeightBiasOp via getattr) m.register_buffer("weight_scale_2", tensor_scale) # Pre-de-block scales to save compute during inference rows, cols = orig_shape block_cols = (cols + 15) // 16 deblocked_scales = from_blocked(blocked_scales, rows, block_cols) m.register_buffer("weight_scale", deblocked_scales) m.original_shape = orig_shape m.comfy_cast_weights = True else: q_weight = m.weight.to(dtype) # We keep it as a Parameter so it can be used in forward m.weight = torch.nn.Parameter(q_weight, requires_grad=False) # Enable weight casting so it dequantizes to input dtype on the fly if hasattr(m, "comfy_cast_weights"): m.comfy_cast_weights = True if hasattr(m, "bias") and m.bias is not None: # Biases are usually kept in higher precision pass def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False): mem_counter, patch_counter, lowvram_counter = 0, 0, 0 loading = sorted([(Device.module_size(m), n, m) for n, m in self.model.named_modules() if hasattr(m, "comfy_cast_weights") or hasattr(m, "weight")], reverse=True) load_completely = [] for module_mem, n, m in loading: lowvram_weight = not full_load and hasattr(m, "comfy_cast_weights") and mem_counter + module_mem >= lowvram_model_memory if lowvram_weight: lowvram_counter += 1 if hasattr(m, "prev_comfy_cast_weights"): continue if force_patch_weights: for pkey in [f"{n}.weight", f"{n}.bias"]: if pkey in self.patches: self.patch_weight_to_device(pkey) m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True else: if hasattr(m, "comfy_cast_weights") and m.comfy_cast_weights: wipe_lowvram_weight(m) if hasattr(m, "weight"): mem_counter += module_mem load_completely.append((module_mem, n, m)) for _, n, m in sorted(load_completely, reverse=True): if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights: continue self.patch_weight_to_device(f"{n}.weight", device_to) self.patch_weight_to_device(f"{n}.bias", device_to) m.comfy_patched_weights = True for _, _, m in load_completely: m.to(device_to) if lowvram_counter > 0: logging.info(f"loaded partially {lowvram_model_memory / 1e6:.1f} {mem_counter / 1e6:.1f} {patch_counter}") self.model.model_lowvram = True else: logging.info(f"loaded completely {lowvram_model_memory / 1e6:.1f} {mem_counter / 1e6:.1f} {full_load}") self.model.model_lowvram = False if full_load: self.model.to(device_to) mem_counter = self.model_size() self.model.lowvram_patch_counter += patch_counter self.model.device, self.model.model_loaded_weight_memory = device_to, mem_counter def _apply_object_patches(self): for k in self.object_patches: old = util.set_attr(self.model, k, self.object_patches[k]) if k not in self.object_patches_backup: self.object_patches_backup[k] = old def patch_model_flux(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False): self._apply_object_patches() if load_weights: self.load(device_to, lowvram_model_memory, force_patch_weights, full_load=lowvram_model_memory == 0) return self.model def patch_model_lowvram_flux(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False): return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights) def patch_model(self, device_to=None, patch_weights=True): self._apply_object_patches() if patch_weights: for key in self.patches: if key not in self.model.state_dict(): logging.warning(f"could not patch. key doesn't exist in model: {key}") continue self.patch_weight_to_device(key, device_to) if device_to: self.model.to(device_to) self.current_device = device_to return self.model def patch_model_lowvram(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False): return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights) def _patch_model_lowvram_impl(self, device_to, lowvram_model_memory, force_patch_weights): self.patch_model(device_to, patch_weights=False) logging.info(f"loading in lowvram mode {lowvram_model_memory / 1e6:.1f}") mem_counter, patch_counter = 0, 0 for n, m in self.model.named_modules(): lowvram_weight = hasattr(m, "comfy_cast_weights") and mem_counter + Device.module_size(m) >= lowvram_model_memory if lowvram_weight: for pkey in [f"{n}.weight", f"{n}.bias"]: if pkey in self.patches: if force_patch_weights: self.patch_weight_to_device(pkey) else: setattr(m, 'weight_function' if 'weight' in pkey else 'bias_function', LowVramPatch(pkey, self)) patch_counter += 1 m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True elif hasattr(m, "weight"): self.patch_weight_to_device(f"{n}.weight", device_to) self.patch_weight_to_device(f"{n}.bias", device_to) m.to(device_to) mem_counter += Device.module_size(m) self.model_lowvram, self.lowvram_patch_counter = True, patch_counter return self.model def calculate_weight(self, patches, weight, key): for p in patches: alpha, v = p[0], p[1][1] mat1 = Device.cast_to_device(v[0], weight.device, torch.float32) mat2 = Device.cast_to_device(v[1], weight.device, torch.float32) if v[2] is not None: alpha *= v[2] / mat2.shape[0] patch_shape = (mat1.shape[0], mat2.shape[1]) if patch_shape != weight.shape: # Handle cases where weight might be flattened but patch is not, or vice versa if mat1.flatten(start_dim=1).shape[0] * mat2.flatten(start_dim=1).shape[1] != weight.numel(): logging.warning(f"Skipping patch for {key}: shape mismatch. Weight: {weight.shape}, Patch: {patch_shape}") continue try: weight += (alpha * torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1))).reshape(weight.shape).type(weight.dtype) except Exception as e: logging.error(f"Failed to apply patch for {key}: {e}") continue return weight def unpatch_model(self, device_to=None, unpatch_weights=True): if unpatch_weights: for k in list(self.backup.keys()): util.set_attr_param(self.model, k, self.backup[k]) self.backup.clear() if device_to: self.model.to(device_to) self.current_device = device_to self.object_patches_backup.clear() def partially_load(self, device_to, extra_memory=0): self.unpatch_model(unpatch_weights=False) self.patch_model(patch_weights=False) if not self.model.model_lowvram: return 0 full_load = self.model.model_loaded_weight_memory + extra_memory > self.model_size() current_used = self.model.model_loaded_weight_memory self.load(device_to, lowvram_model_memory=current_used + extra_memory, full_load=full_load) return self.model.model_loaded_weight_memory - current_used def add_object_patch(self, name, obj): self.object_patches[name] = obj def apply_tome(self, ratio=0.5, max_downsample=1): if not TOMESD_AVAILABLE: logging.warning("Token Merging (tomesd) not available") return False try: tomesd.remove_patch(self) except: pass self.tome_enabled, self.tome_ratio = False, 0.5 try: if hasattr(self.model, 'diffusion_model'): tomesd.apply_patch(self, ratio=ratio, max_downsample=max_downsample) self.tome_enabled, self.tome_ratio = True, ratio logging.info(f"Applied Token Merging with ratio={ratio}, max_downsample={max_downsample}") return True return False except Exception as e: logging.error(f"Failed to apply Token Merging: {e}") return False def remove_tome(self): if not TOMESD_AVAILABLE or not self.tome_enabled: return False try: tomesd.remove_patch(self) self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {} return True except Exception as e: logging.error(f"Failed to remove Token Merging: {e}") return False def unet_prefix_from_state_dict(state_dict): counts = {c: sum(1 for k in state_dict if k.startswith(c)) for c in ["model.diffusion_model.", "model.model."]} top = max(counts, key=counts.get) return top if counts[top] > 5 else "model." def load_diffusion_model_state_dict(sd, model_options={}): dtype = model_options.get("dtype") prefix = unet_prefix_from_state_dict(sd) temp_sd = util.state_dict_prefix_replace(sd, {prefix: ""}, filter_keys=True) if len(temp_sd) > 0: sd = temp_sd parameters, load_device = util.calculate_parameters(sd), Device.get_torch_device() model_config = unet.model_config_from_unet(sd, "") offload_device = Device.unet_offload_device() unet_dtype2 = dtype or Device.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes) manual_cast_dtype = Device.unet_manual_cast(unet_dtype2, load_device, model_config.supported_inference_dtypes) model_config.set_inference_dtype(unet_dtype2, manual_cast_dtype) model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations) model = model_config.get_model(sd, "").to(offload_device) model.load_model_weights(sd, "") return ModelPatcher(model, load_device=load_device, offload_device=offload_device)