| """ModelPatcher - Handles LoRA and weight patching for models.""" | |
| import copy | |
| import logging | |
| import uuid | |
| import torch | |
| from src.Device import Device | |
| from src.NeuralNetwork import unet | |
| from src.Utilities import util | |
| try: | |
| import tomesd | |
| import tomesd.patch | |
| TOMESD_AVAILABLE = True | |
| # Monkey-patch tomesd to support our transformer_options argument in _forward | |
| _original_make_tome_block = tomesd.patch.make_tome_block | |
| def _fixed_make_tome_block(block_class): | |
| cls = _original_make_tome_block(block_class) | |
| old_forward = cls._forward | |
| def new_forward(self, x, context=None, *args, **kwargs): | |
| return old_forward(self, x, context) | |
| cls._forward = new_forward | |
| return cls | |
| tomesd.patch.make_tome_block = _fixed_make_tome_block | |
| except ImportError: | |
| TOMESD_AVAILABLE = False | |
| tomesd = None | |
def wipe_lowvram_weight(m):
    """Restore a module that was switched to low-VRAM on-the-fly weight casting."""
    if hasattr(m, "prev_comfy_cast_weights"):
        m.comfy_cast_weights = m.prev_comfy_cast_weights
        del m.prev_comfy_cast_weights
    m.weight_function = m.bias_function = None


class LowVramPatch:
    """Deferred patch: applies a key's accumulated patches to the weight at call time."""

    def __init__(self, key: str, model_patcher: "ModelPatcher"):
        self.key, self.model_patcher = key, model_patcher

    def __call__(self, weight: torch.Tensor) -> torch.Tensor:
        return self.model_patcher.calculate_weight(self.model_patcher.patches[self.key], weight, self.key)
class ModelFunctionWrapperChain:
    """Compose multiple model_function_wrapper hooks without overwriting them.

    Several optimizations patch the same U-Net wrapper hook. Keeping only the
    last wrapper silently disables earlier optimizations. This chain preserves
    application order by making the most recently added wrapper the outermost
    wrapper around the existing stack.
    """

    def __init__(self, wrappers=None):
        self.wrappers = list(wrappers or [])

    def add_outer(self, wrapper):
        self.wrappers.insert(0, wrapper)
        return self

    def __call__(self, model_function, params):
        return self._invoke(0, model_function, params)

    def _invoke(self, index, model_function, params):
        if index >= len(self.wrappers):
            return model_function(
                params["input"],
                params["timestep"],
                **params.get("c", {}),
            )
        wrapper = self.wrappers[index]

        def next_model_function(input_x, timestep, **c_kwargs):
            next_params = dict(params)
            next_params["input"] = input_x
            next_params["timestep"] = timestep
            next_params["c"] = c_kwargs
            return self._invoke(index + 1, model_function, next_params)

        return wrapper(next_model_function, params)

    def to(self, device):
        updated = []
        for wrapper in self.wrappers:
            if hasattr(wrapper, "to"):
                moved = wrapper.to(device)
                updated.append(moved if moved is not None else wrapper)
            else:
                updated.append(wrapper)
        self.wrappers = updated
        return self
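
# Usage sketch (hypothetical wrappers, shown only to illustrate ordering):
#
#     def cfg_wrapper(model_function, params):
#         # ... pre-process params ...
#         return model_function(params["input"], params["timestep"], **params["c"])
#
#     chain = ModelFunctionWrapperChain([cfg_wrapper])
#     chain.add_outer(other_wrapper)  # other_wrapper now runs outermost, around cfg_wrapper
#     out = chain(model_function, {"input": x, "timestep": t, "c": {}})
#
# set_model_unet_function_wrapper() below builds this chain automatically when a
# second wrapper is registered.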
class ModelPatcher:
    def __init__(self, model: torch.nn.Module, load_device: torch.device, offload_device: torch.device,
                 size: int = 0, current_device: torch.device = None, weight_inplace_update: bool = False):
        self.size, self.model, self.patches, self.backup = size, model, {}, {}
        self.object_patches, self.object_patches_backup = {}, {}
        self.model_options = {"transformer_options": {}}
        self.model_size()
        self.load_device, self.offload_device = load_device, offload_device
        self.current_device = current_device or self.offload_device
        self.weight_inplace_update, self.model_lowvram, self.lowvram_patch_counter = weight_inplace_update, False, 0
        self.patches_uuid = uuid.uuid4()
        self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {}
        for attr, default in [("model_loaded_weight_memory", 0), ("model_lowvram", False), ("lowvram_patch_counter", 0)]:
            if not hasattr(self.model, attr):
                setattr(self.model, attr, default)

    def named_modules(self):
        if hasattr(self.model, "diffusion_model"):
            yield from self.model.diffusion_model.named_modules()

    def loaded_size(self):
        return self.model.model_loaded_weight_memory

    def model_size(self) -> int:
        # Always populate model_keys, even when a size was passed in (e.g. by clone()),
        # since add_patches() relies on it.
        if not hasattr(self, "model_keys"):
            self.model_keys = set(self.model.state_dict().keys())
        if self.size > 0:
            return self.size
        self.size = Device.module_size(self.model)
        return self.size
    def clone(self) -> "ModelPatcher":
        n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, self.weight_inplace_update)
        n.patches = {k: v[:] for k, v in self.patches.items()}
        n.patches_uuid, n.object_patches = self.patches_uuid, self.object_patches.copy()
        n.model_options, n.model_keys, n.backup = copy.deepcopy(self.model_options), self.model_keys, self.backup
        n.object_patches_backup = self.object_patches_backup
        return n

    def is_clone(self, other):
        return hasattr(other, "model") and self.model is other.model

    def memory_required(self, input_shape):
        return self.model.memory_required(input_shape=input_shape)

    def set_model_unet_function_wrapper(self, f):
        existing = self.model_options.get("model_function_wrapper")
        if existing is None:
            self.model_options["model_function_wrapper"] = f
            return
        if isinstance(existing, ModelFunctionWrapperChain):
            existing.add_outer(f)
            self.model_options["model_function_wrapper"] = existing
            return
        self.model_options["model_function_wrapper"] = ModelFunctionWrapperChain([f, existing])

    def set_model_denoise_mask_function(self, f):
        self.model_options["denoise_mask_function"] = f

    def get_model_object(self, name):
        return util.get_attr(self.model, name)

    def model_patches_to(self, device):
        wrap_func = self.model_options.get("model_function_wrapper")
        if wrap_func and hasattr(wrap_func, "to"):
            self.model_options["model_function_wrapper"] = wrap_func.to(device)

    def model_dtype(self):
        return self.model.get_dtype() if hasattr(self.model, "get_dtype") else None

    def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
        p = set()
        for k in patches:
            if k in self.model_keys:
                p.add(k)
                self.patches[k] = self.patches.get(k, []) + [(strength_patch, patches[k], strength_model)]
        self.patches_uuid = uuid.uuid4()
        return list(p)
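
    # Patch entries follow the LoRA layout that calculate_weight() expects:
    # patches[key] = (patch_type, (up, down, alpha)), where up is (out_features, rank),
    # down is (rank, in_features), and alpha may be None. A sketch with made-up names:
    #
    #     up, down = torch.randn(320, 4), torch.randn(4, 768)
    #     patcher.add_patches({"diffusion_model.some.weight": ("lora", (up, down, None))},
    #                         strength_patch=0.8)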
    def set_model_patch(self, patch, name):
        patches = self.model_options["transformer_options"].setdefault("patches", {})
        patches[name] = patches.get(name, []) + [patch]

    def set_model_attn1_patch(self, patch):
        self.set_model_patch(patch, "attn1_patch")

    def set_model_attn2_patch(self, patch):
        self.set_model_patch(patch, "attn2_patch")

    def set_model_attn1_output_patch(self, patch):
        self.set_model_patch(patch, "attn1_output_patch")

    def set_model_attn2_output_patch(self, patch):
        self.set_model_patch(patch, "attn2_output_patch")

    def model_state_dict(self, filter_prefix=None):
        sd = self.model.state_dict()
        if filter_prefix is not None:
            sd = {k: v for k, v in sd.items() if k.startswith(filter_prefix)}
        return sd

    def patch_weight_to_device(self, key, device_to=None):
        if key not in self.patches:
            return
        weight = util.get_attr(self.model, key)
        if key not in self.backup:
            self.backup[key] = weight.to(device=self.offload_device, copy=self.weight_inplace_update)
        temp_weight = Device.cast_to_device(weight, device_to, torch.float32, copy=True) if device_to else weight.to(torch.float32, copy=True)
        out_weight = self.calculate_weight(self.patches[key], temp_weight, key).to(weight.dtype)
        (util.copy_to_param if self.weight_inplace_update else util.set_attr_param)(self.model, key, out_weight)
    def weight_only_quantize(self, dtype: torch.dtype | str = torch.float8_e4m3fn):
        """Quantize all model weights to the target dtype or format (weight-only)."""
        if isinstance(dtype, str):
            format_name = dtype.lower()
        else:
            format_name = str(dtype)
        logging.info(f"Quantizing model weights to {format_name}")
        with torch.no_grad():
            for n, m in self.model.named_modules():
                if hasattr(m, "weight") and m.weight is not None:
                    # Don't quantize small tensors or non-float weights
                    if m.weight.numel() > 4096 and m.weight.is_floating_point() and m.weight.ndim == 2:
                        if format_name == "nvfp4":
                            from src.Utilities.Quantization import quantize_nvfp4, from_blocked
                            orig_shape = m.weight.shape
                            q_weight, tensor_scale, blocked_scales = quantize_nvfp4(m.weight)
                            m.weight = torch.nn.Parameter(q_weight, requires_grad=False)
                            m.quant_format = "nvfp4"
                            # Register as buffers so they move with the model
                            # (they are automatically handled by CastWeightBiasOp via getattr)
                            m.register_buffer("weight_scale_2", tensor_scale)
                            # Pre-de-block scales to save compute during inference
                            rows, cols = orig_shape
                            block_cols = (cols + 15) // 16
                            deblocked_scales = from_blocked(blocked_scales, rows, block_cols)
                            m.register_buffer("weight_scale", deblocked_scales)
                            m.original_shape = orig_shape
                            m.comfy_cast_weights = True
                        else:
                            # Resolve string names such as "float8_e4m3fn" to a torch dtype;
                            # Tensor.to() would otherwise treat the string as a device.
                            target_dtype = getattr(torch, format_name) if isinstance(dtype, str) else dtype
                            q_weight = m.weight.to(target_dtype)
                            # Keep it as a Parameter so it can be used in forward
                            m.weight = torch.nn.Parameter(q_weight, requires_grad=False)
                            # Enable weight casting so it dequantizes to input dtype on the fly
                            if hasattr(m, "comfy_cast_weights"):
                                m.comfy_cast_weights = True
                    if hasattr(m, "bias") and m.bias is not None:
                        # Biases are usually kept in higher precision
                        pass
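
    # Usage sketch: both spellings below should be equivalent for FP8 (the string
    # path resolves names via getattr(torch, ...)); "nvfp4" takes the block-scaled
    # path above.
    #
    #     patcher.weight_only_quantize(torch.float8_e4m3fn)
    #     patcher.weight_only_quantize("float8_e4m3fn")
    #     patcher.weight_only_quantize("nvfp4")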
    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
        mem_counter, patch_counter, lowvram_counter = 0, 0, 0
        # Largest modules first, so the biggest weights get first claim on the memory budget.
        loading = sorted([(Device.module_size(m), n, m) for n, m in self.model.named_modules()
                          if hasattr(m, "comfy_cast_weights") or hasattr(m, "weight")], reverse=True)
        load_completely = []
        for module_mem, n, m in loading:
            # A module stays offloaded (cast on the fly) once the budget is exhausted.
            lowvram_weight = not full_load and hasattr(m, "comfy_cast_weights") and mem_counter + module_mem >= lowvram_model_memory
            if lowvram_weight:
                lowvram_counter += 1
                if hasattr(m, "prev_comfy_cast_weights"):
                    continue
                if force_patch_weights:
                    for pkey in [f"{n}.weight", f"{n}.bias"]:
                        if pkey in self.patches:
                            self.patch_weight_to_device(pkey)
                m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True
            else:
                if hasattr(m, "comfy_cast_weights") and m.comfy_cast_weights:
                    wipe_lowvram_weight(m)
                if hasattr(m, "weight"):
                    mem_counter += module_mem
                    load_completely.append((module_mem, n, m))
        for _, n, m in sorted(load_completely, reverse=True):
            if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights:
                continue
            self.patch_weight_to_device(f"{n}.weight", device_to)
            self.patch_weight_to_device(f"{n}.bias", device_to)
            m.comfy_patched_weights = True
        for _, _, m in load_completely:
            m.to(device_to)
        if lowvram_counter > 0:
            logging.info(f"loaded partially {lowvram_model_memory / 1e6:.1f} {mem_counter / 1e6:.1f} {patch_counter}")
            self.model.model_lowvram = True
        else:
            logging.info(f"loaded completely {lowvram_model_memory / 1e6:.1f} {mem_counter / 1e6:.1f} {full_load}")
            self.model.model_lowvram = False
            if full_load:
                self.model.to(device_to)
                mem_counter = self.model_size()
        self.model.lowvram_patch_counter += patch_counter
        self.model.device, self.model.model_loaded_weight_memory = device_to, mem_counter
    def _apply_object_patches(self):
        for k in self.object_patches:
            old = util.set_attr(self.model, k, self.object_patches[k])
            if k not in self.object_patches_backup:
                self.object_patches_backup[k] = old

    def patch_model_flux(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
        self._apply_object_patches()
        if load_weights:
            self.load(device_to, lowvram_model_memory, force_patch_weights, full_load=lowvram_model_memory == 0)
        return self.model

    def patch_model_lowvram_flux(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
        return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights)

    def patch_model(self, device_to=None, patch_weights=True):
        self._apply_object_patches()
        if patch_weights:
            model_sd_keys = set(self.model.state_dict().keys())
            for key in self.patches:
                if key not in model_sd_keys:
                    logging.warning(f"could not patch. key doesn't exist in model: {key}")
                    continue
                self.patch_weight_to_device(key, device_to)
            if device_to:
                self.model.to(device_to)
                self.current_device = device_to
        return self.model
    def patch_model_lowvram(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
        return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights)

    def _patch_model_lowvram_impl(self, device_to, lowvram_model_memory, force_patch_weights):
        self.patch_model(device_to, patch_weights=False)
        logging.info(f"loading in lowvram mode {lowvram_model_memory / 1e6:.1f}")
        mem_counter, patch_counter = 0, 0
        for n, m in self.model.named_modules():
            lowvram_weight = hasattr(m, "comfy_cast_weights") and mem_counter + Device.module_size(m) >= lowvram_model_memory
            if lowvram_weight:
                for pkey in [f"{n}.weight", f"{n}.bias"]:
                    if pkey in self.patches:
                        if force_patch_weights:
                            self.patch_weight_to_device(pkey)
                        else:
                            setattr(m, 'weight_function' if pkey.endswith('.weight') else 'bias_function', LowVramPatch(pkey, self))
                            patch_counter += 1
                m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True
            elif hasattr(m, "weight"):
                self.patch_weight_to_device(f"{n}.weight", device_to)
                self.patch_weight_to_device(f"{n}.bias", device_to)
                m.to(device_to)
                mem_counter += Device.module_size(m)
        self.model_lowvram, self.lowvram_patch_counter = True, patch_counter
        return self.model
    def calculate_weight(self, patches, weight, key):
        # Each patch entry is (strength, (patch_type, (up, down, alpha[, ...])), strength_model);
        # the LoRA delta strength * scale * (up @ down) is added to the weight in place.
        for p in patches:
            alpha, v = p[0], p[1][1]
            mat1 = Device.cast_to_device(v[0], weight.device, torch.float32)
            mat2 = Device.cast_to_device(v[1], weight.device, torch.float32)
            if v[2] is not None:
                # Scale by alpha / rank, the standard LoRA convention.
                alpha *= v[2] / mat2.shape[0]
            patch_shape = (mat1.shape[0], mat2.shape[1])
            if patch_shape != weight.shape:
                # Handle cases where weight might be flattened but patch is not, or vice versa
                if mat1.flatten(start_dim=1).shape[0] * mat2.flatten(start_dim=1).shape[1] != weight.numel():
                    logging.warning(f"Skipping patch for {key}: shape mismatch. Weight: {weight.shape}, Patch: {patch_shape}")
                    continue
            try:
                weight += (alpha * torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1))).reshape(weight.shape).type(weight.dtype)
            except Exception as e:
                logging.error(f"Failed to apply patch for {key}: {e}")
                continue
        return weight
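
    # Shape sanity check (made-up sizes): for a (320, 768) weight with rank-4 LoRA,
    # up is (320, 4) and down is (4, 768), so up @ down is (320, 768) and matches
    # weight.shape directly; weights that store the same number of elements in a
    # different layout fall through to the numel() check and reshape above.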
    def unpatch_model(self, device_to=None, unpatch_weights=True):
        if unpatch_weights:
            for k in list(self.backup.keys()):
                util.set_attr_param(self.model, k, self.backup[k])
            self.backup.clear()
            if device_to:
                self.model.to(device_to)
                self.current_device = device_to
        self.object_patches_backup.clear()

    def partially_load(self, device_to, extra_memory=0):
        self.unpatch_model(unpatch_weights=False)
        self.patch_model(patch_weights=False)
        if not self.model.model_lowvram:
            return 0
        full_load = self.model.model_loaded_weight_memory + extra_memory > self.model_size()
        current_used = self.model.model_loaded_weight_memory
        self.load(device_to, lowvram_model_memory=current_used + extra_memory, full_load=full_load)
        return self.model.model_loaded_weight_memory - current_used

    def add_object_patch(self, name, obj):
        self.object_patches[name] = obj
    def apply_tome(self, ratio=0.5, max_downsample=1):
        if not TOMESD_AVAILABLE:
            logging.warning("Token Merging (tomesd) not available")
            return False
        try:
            tomesd.remove_patch(self)
        except Exception:
            pass
        self.tome_enabled, self.tome_ratio = False, 0.5
        try:
            if hasattr(self.model, 'diffusion_model'):
                tomesd.apply_patch(self, ratio=ratio, max_downsample=max_downsample)
                self.tome_enabled, self.tome_ratio = True, ratio
                logging.info(f"Applied Token Merging with ratio={ratio}, max_downsample={max_downsample}")
                return True
            return False
        except Exception as e:
            logging.error(f"Failed to apply Token Merging: {e}")
            return False

    def remove_tome(self):
        if not TOMESD_AVAILABLE or not self.tome_enabled:
            return False
        try:
            tomesd.remove_patch(self)
            self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {}
            return True
        except Exception as e:
            logging.error(f"Failed to remove Token Merging: {e}")
            return False
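
    # Usage sketch: ratio is the fraction of tokens tomesd merges in attention
    # blocks; higher ratios trade quality for speed. A typical round trip:
    #
    #     patcher.apply_tome(ratio=0.4, max_downsample=2)
    #     ...  # run sampling
    #     patcher.remove_tome()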
def unet_prefix_from_state_dict(state_dict):
    counts = {c: sum(1 for k in state_dict if k.startswith(c)) for c in ["model.diffusion_model.", "model.model."]}
    top = max(counts, key=counts.get)
    return top if counts[top] > 5 else "model."
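
# Example: a checkpoint whose keys look like "model.diffusion_model.input_blocks.0..."
# yields "model.diffusion_model.", which load_diffusion_model_state_dict() then strips
# so the bare UNet keys remain. Fewer than six matching keys falls back to "model.".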
def load_diffusion_model_state_dict(sd, model_options=None):
    model_options = model_options or {}
    dtype = model_options.get("dtype")
    prefix = unet_prefix_from_state_dict(sd)
    temp_sd = util.state_dict_prefix_replace(sd, {prefix: ""}, filter_keys=True)
    if len(temp_sd) > 0:
        sd = temp_sd
    parameters, load_device = util.calculate_parameters(sd), Device.get_torch_device()
    model_config = unet.model_config_from_unet(sd, "")
    if model_config is None:
        raise RuntimeError("ERROR: Could not detect model type from state dict")
    offload_device = Device.unet_offload_device()
    unet_dtype2 = dtype or Device.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
    manual_cast_dtype = Device.unet_manual_cast(unet_dtype2, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype2, manual_cast_dtype)
    model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
    model = model_config.get_model(sd, "").to(offload_device)
    model.load_model_weights(sd, "")
    return ModelPatcher(model, load_device=load_device, offload_device=offload_device)
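
# End-to-end usage sketch. The safetensors loading shown here is an assumption for
# illustration; substitute this project's checkpoint loader where appropriate.
#
#     import safetensors.torch
#     sd = safetensors.torch.load_file("unet.safetensors")
#     patcher = load_diffusion_model_state_dict(sd, model_options={"dtype": torch.float16})
#     patcher.add_patches(lora_patches, strength_patch=0.8)  # lora_patches: hypothetical dict
#     model = patcher.patch_model(device_to=patcher.load_device)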