"""ModelPatcher - Handles LoRA and weight patching for models."""
import copy
import logging
import uuid
import torch
from src.Device import Device
from src.NeuralNetwork import unet
from src.Utilities import util

try:
    import tomesd
    import tomesd.patch

    TOMESD_AVAILABLE = True

    # Monkey-patch tomesd to support our transformer_options argument in _forward
    _original_make_tome_block = tomesd.patch.make_tome_block

    def _fixed_make_tome_block(block_class):
        cls = _original_make_tome_block(block_class)
        old_forward = cls._forward

        def new_forward(self, x, context=None, *args, **kwargs):
            return old_forward(self, x, context)

        cls._forward = new_forward
        return cls

    tomesd.patch.make_tome_block = _fixed_make_tome_block
except ImportError:
    TOMESD_AVAILABLE = False
    tomesd = None


def wipe_lowvram_weight(m):
    if hasattr(m, "prev_comfy_cast_weights"):
        m.comfy_cast_weights = m.prev_comfy_cast_weights
        del m.prev_comfy_cast_weights
    m.weight_function = m.bias_function = None
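

# LowVramPatch is assigned as a module's weight_function/bias_function in low-VRAM mode:
# instead of baking LoRA patches into the stored weight, it recomputes the patched weight
# on the fly each time the module's weights are cast for a forward pass.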
class LowVramPatch:
    def __init__(self, key: str, model_patcher: "ModelPatcher"):
        self.key, self.model_patcher = key, model_patcher

    def __call__(self, weight: torch.Tensor) -> torch.Tensor:
        return self.model_patcher.calculate_weight(self.model_patcher.patches[self.key], weight, self.key)


class ModelFunctionWrapperChain:
    """Compose multiple model_function_wrapper hooks without overwriting them.

    Several optimizations patch the same U-Net wrapper hook, and keeping only the
    last wrapper silently disables the earlier ones. This chain keeps every hook
    active, with the most recently added wrapper becoming the outermost layer
    around the existing stack.
    """

    def __init__(self, wrappers=None):
        self.wrappers = list(wrappers or [])

    def add_outer(self, wrapper):
        self.wrappers.insert(0, wrapper)
        return self

    def __call__(self, model_function, params):
        return self._invoke(0, model_function, params)

    def _invoke(self, index, model_function, params):
        if index >= len(self.wrappers):
            return model_function(
                params["input"],
                params["timestep"],
                **params.get("c", {}),
            )
        wrapper = self.wrappers[index]

        def next_model_function(input_x, timestep, **c_kwargs):
            next_params = dict(params)
            next_params["input"] = input_x
            next_params["timestep"] = timestep
            next_params["c"] = c_kwargs
            return self._invoke(index + 1, model_function, next_params)

        return wrapper(next_model_function, params)

    def to(self, device):
        updated = []
        for wrapper in self.wrappers:
            if hasattr(wrapper, "to"):
                moved = wrapper.to(device)
                updated.append(moved if moved is not None else wrapper)
            else:
                updated.append(wrapper)
        self.wrappers = updated
        return self


class ModelPatcher:
    def __init__(self, model: torch.nn.Module, load_device: torch.device, offload_device: torch.device,
                 size: int = 0, current_device: torch.device = None, weight_inplace_update: bool = False):
        self.size, self.model, self.patches, self.backup = size, model, {}, {}
        self.object_patches, self.object_patches_backup = {}, {}
        self.model_options = {"transformer_options": {}}
        self.model_size()
        self.load_device, self.offload_device = load_device, offload_device
        self.current_device = current_device or self.offload_device
        self.weight_inplace_update, self.model_lowvram, self.lowvram_patch_counter = weight_inplace_update, False, 0
        self.patches_uuid = uuid.uuid4()
        self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {}
        for attr, default in [("model_loaded_weight_memory", 0), ("model_lowvram", False), ("lowvram_patch_counter", 0)]:
            if not hasattr(self.model, attr):
                setattr(self.model, attr, default)

    def named_modules(self):
        if hasattr(self.model, 'diffusion_model'):
            yield from self.model.diffusion_model.named_modules()

    def loaded_size(self):
        return self.model.model_loaded_weight_memory

    def model_size(self) -> int:
        if self.size > 0:
            return self.size
        self.size = Device.module_size(self.model)
        self.model_keys = set(self.model.state_dict().keys())
        return self.size
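
    # clone() shares the underlying model, backup dict and object-patch backups with the
    # original; only the per-key patch lists are copied, so clones are cheap and adding
    # patches to a clone does not affect the original.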
    def clone(self) -> "ModelPatcher":
        n = ModelPatcher(self.model, self.load_device, self.offload_device, self.size, self.current_device, self.weight_inplace_update)
        n.patches = {k: v[:] for k, v in self.patches.items()}
        n.patches_uuid, n.object_patches = self.patches_uuid, self.object_patches.copy()
        n.model_options, n.model_keys, n.backup = copy.deepcopy(self.model_options), self.model_keys, self.backup
        n.object_patches_backup = self.object_patches_backup
        return n

    def is_clone(self, other):
        return hasattr(other, "model") and self.model is other.model

    def memory_required(self, input_shape):
        return self.model.memory_required(input_shape=input_shape)

    def set_model_unet_function_wrapper(self, f):
        existing = self.model_options.get("model_function_wrapper")
        if existing is None:
            self.model_options["model_function_wrapper"] = f
            return
        if isinstance(existing, ModelFunctionWrapperChain):
            existing.add_outer(f)
            self.model_options["model_function_wrapper"] = existing
            return
        self.model_options["model_function_wrapper"] = ModelFunctionWrapperChain([f, existing])

    def set_model_denoise_mask_function(self, f):
        self.model_options["denoise_mask_function"] = f

    def get_model_object(self, name):
        return util.get_attr(self.model, name)

    def model_patches_to(self, device):
        wrap_func = self.model_options.get("model_function_wrapper")
        if wrap_func and hasattr(wrap_func, "to"):
            self.model_options["model_function_wrapper"] = wrap_func.to(device)

    def model_dtype(self):
        return self.model.get_dtype() if hasattr(self.model, "get_dtype") else None
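
    # Patches are stored per weight key as a list of (strength_patch, payload, strength_model)
    # tuples, so several LoRAs can stack on the same weight; they are applied in the order added.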
    def add_patches(self, patches, strength_patch=1.0, strength_model=1.0):
        p = set()
        for k in patches:
            if k in self.model_keys:
                p.add(k)
                self.patches[k] = self.patches.get(k, []) + [(strength_patch, patches[k], strength_model)]
        self.patches_uuid = uuid.uuid4()
        return list(p)

    def set_model_patch(self, patch, name):
        to = self.model_options["transformer_options"]
        to.setdefault("patches", {})[name] = to.get("patches", {}).get(name, []) + [patch]

    def set_model_attn1_patch(self, patch):
        self.set_model_patch(patch, "attn1_patch")

    def set_model_attn2_patch(self, patch):
        self.set_model_patch(patch, "attn2_patch")

    def set_model_attn1_output_patch(self, patch):
        self.set_model_patch(patch, "attn1_output_patch")

    def set_model_attn2_output_patch(self, patch):
        self.set_model_patch(patch, "attn2_output_patch")

    def model_state_dict(self, filter_prefix=None):
        return self.model.state_dict()

    def patch_weight_to_device(self, key, device_to=None):
        if key not in self.patches:
            return
        weight = util.get_attr(self.model, key)
        if key not in self.backup:
            self.backup[key] = weight.to(device=self.offload_device, copy=self.weight_inplace_update)
        temp_weight = Device.cast_to_device(weight, device_to, torch.float32, copy=True) if device_to else weight.to(torch.float32, copy=True)
        out_weight = self.calculate_weight(self.patches[key], temp_weight, key).to(weight.dtype)
        (util.copy_to_param if self.weight_inplace_update else util.set_attr_param)(self.model, key, out_weight)
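
    # weight_only_quantize converts only 2-D floating-point weights and leaves biases and
    # activations untouched. For "nvfp4" the original weight is replaced by the output of
    # quantize_nvfp4 plus two scale buffers: block scales ("weight_scale", one per 16 columns
    # of the original weight) and a global tensor scale ("weight_scale_2"); comfy_cast_weights
    # then dequantizes back to the input dtype on the fly during forward passes.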
    def weight_only_quantize(self, dtype: torch.dtype | str = torch.float8_e4m3fn):
        """Quantize all model weights to the target dtype or format (weight-only)."""
        if isinstance(dtype, str):
            format_name = dtype.lower()
        else:
            format_name = str(dtype)
        logging.info(f"Quantizing model weights to {format_name}")
        with torch.no_grad():
            for n, m in self.model.named_modules():
                if hasattr(m, "weight") and m.weight is not None:
                    # Don't quantize small tensors or non-float weights
                    if m.weight.numel() > 4096 and m.weight.is_floating_point() and m.weight.ndim == 2:
                        if format_name == "nvfp4":
                            from src.Utilities.Quantization import quantize_nvfp4, from_blocked
                            orig_shape = m.weight.shape
                            q_weight, tensor_scale, blocked_scales = quantize_nvfp4(m.weight)
                            m.weight = torch.nn.Parameter(q_weight, requires_grad=False)
                            m.quant_format = "nvfp4"
                            # Register as buffers so they move with the model
                            # (they are automatically handled by CastWeightBiasOp via getattr)
                            m.register_buffer("weight_scale_2", tensor_scale)
                            # Pre-de-block scales to save compute during inference
                            rows, cols = orig_shape
                            block_cols = (cols + 15) // 16
                            deblocked_scales = from_blocked(blocked_scales, rows, block_cols)
                            m.register_buffer("weight_scale", deblocked_scales)
                            m.original_shape = orig_shape
                            m.comfy_cast_weights = True
                        else:
                            q_weight = m.weight.to(dtype)
                            # We keep it as a Parameter so it can be used in forward
                            m.weight = torch.nn.Parameter(q_weight, requires_grad=False)
                            # Enable weight casting so it dequantizes to input dtype on the fly
                            if hasattr(m, "comfy_cast_weights"):
                                m.comfy_cast_weights = True
                if hasattr(m, "bias") and m.bias is not None:
                    # Biases are usually kept in higher precision
                    pass
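
    # load() visits modules largest-first: modules are fully patched and moved to device_to
    # until the running size total would reach lowvram_model_memory; past that budget, modules
    # keep comfy_cast_weights enabled so their weights are cast on the fly instead of staying
    # resident on the load device (unless full_load is set, which moves the whole model).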
    def load(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False, full_load=False):
        mem_counter, patch_counter, lowvram_counter = 0, 0, 0
        loading = sorted([(Device.module_size(m), n, m) for n, m in self.model.named_modules()
                          if hasattr(m, "comfy_cast_weights") or hasattr(m, "weight")], reverse=True)
        load_completely = []
        for module_mem, n, m in loading:
            lowvram_weight = not full_load and hasattr(m, "comfy_cast_weights") and mem_counter + module_mem >= lowvram_model_memory
            if lowvram_weight:
                lowvram_counter += 1
                if hasattr(m, "prev_comfy_cast_weights"):
                    continue
                if force_patch_weights:
                    for pkey in [f"{n}.weight", f"{n}.bias"]:
                        if pkey in self.patches:
                            self.patch_weight_to_device(pkey)
                m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True
            else:
                if hasattr(m, "comfy_cast_weights") and m.comfy_cast_weights:
                    wipe_lowvram_weight(m)
                if hasattr(m, "weight"):
                    mem_counter += module_mem
                    load_completely.append((module_mem, n, m))
        for _, n, m in sorted(load_completely, reverse=True):
            if hasattr(m, "comfy_patched_weights") and m.comfy_patched_weights:
                continue
            self.patch_weight_to_device(f"{n}.weight", device_to)
            self.patch_weight_to_device(f"{n}.bias", device_to)
            m.comfy_patched_weights = True
        for _, _, m in load_completely:
            m.to(device_to)
        if lowvram_counter > 0:
            logging.info(f"loaded partially {lowvram_model_memory / 1e6:.1f} {mem_counter / 1e6:.1f} {patch_counter}")
            self.model.model_lowvram = True
        else:
            logging.info(f"loaded completely {lowvram_model_memory / 1e6:.1f} {mem_counter / 1e6:.1f} {full_load}")
            self.model.model_lowvram = False
            if full_load:
                self.model.to(device_to)
                mem_counter = self.model_size()
        self.model.lowvram_patch_counter += patch_counter
        self.model.device, self.model.model_loaded_weight_memory = device_to, mem_counter

    def _apply_object_patches(self):
        for k in self.object_patches:
            old = util.set_attr(self.model, k, self.object_patches[k])
            if k not in self.object_patches_backup:
                self.object_patches_backup[k] = old

    def patch_model_flux(self, device_to=None, lowvram_model_memory=0, load_weights=True, force_patch_weights=False):
        self._apply_object_patches()
        if load_weights:
            self.load(device_to, lowvram_model_memory, force_patch_weights, full_load=lowvram_model_memory == 0)
        return self.model

    def patch_model_lowvram_flux(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
        return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights)

    def patch_model(self, device_to=None, patch_weights=True):
        self._apply_object_patches()
        if patch_weights:
            for key in self.patches:
                if key not in self.model.state_dict():
                    logging.warning(f"could not patch. key doesn't exist in model: {key}")
                    continue
                self.patch_weight_to_device(key, device_to)
            if device_to:
                self.model.to(device_to)
                self.current_device = device_to
        return self.model

    def patch_model_lowvram(self, device_to=None, lowvram_model_memory=0, force_patch_weights=False):
        return self._patch_model_lowvram_impl(device_to, lowvram_model_memory, force_patch_weights)

    def _patch_model_lowvram_impl(self, device_to, lowvram_model_memory, force_patch_weights):
        self.patch_model(device_to, patch_weights=False)
        logging.info(f"loading in lowvram mode {lowvram_model_memory / 1e6:.1f}")
        mem_counter, patch_counter = 0, 0
        for n, m in self.model.named_modules():
            lowvram_weight = hasattr(m, "comfy_cast_weights") and mem_counter + Device.module_size(m) >= lowvram_model_memory
            if lowvram_weight:
                for pkey in [f"{n}.weight", f"{n}.bias"]:
                    if pkey in self.patches:
                        if force_patch_weights:
                            self.patch_weight_to_device(pkey)
                        else:
                            setattr(m, 'weight_function' if pkey.endswith('.weight') else 'bias_function', LowVramPatch(pkey, self))
                            patch_counter += 1
                m.prev_comfy_cast_weights, m.comfy_cast_weights = m.comfy_cast_weights, True
            elif hasattr(m, "weight"):
                self.patch_weight_to_device(f"{n}.weight", device_to)
                self.patch_weight_to_device(f"{n}.bias", device_to)
                m.to(device_to)
                mem_counter += Device.module_size(m)
        self.model_lowvram, self.lowvram_patch_counter = True, patch_counter
        return self.model
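
    # LoRA-style weight merge: for each patch, p[0] is the patch strength and p[1][1]
    # holds (up, down, alpha); the applied update is
    #   W <- W + strength * (alpha / rank) * (up @ down)
    # where rank = down.shape[0], with both factors flattened to 2-D and cast to float32
    # on the weight's device before the matmul.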
    def calculate_weight(self, patches, weight, key):
        for p in patches:
            alpha, v = p[0], p[1][1]
            mat1 = Device.cast_to_device(v[0], weight.device, torch.float32)
            mat2 = Device.cast_to_device(v[1], weight.device, torch.float32)
            if v[2] is not None:
                alpha *= v[2] / mat2.shape[0]
            patch_shape = (mat1.shape[0], mat2.shape[1])
            if patch_shape != weight.shape:
                # Handle cases where weight might be flattened but patch is not, or vice versa
                if mat1.flatten(start_dim=1).shape[0] * mat2.flatten(start_dim=1).shape[1] != weight.numel():
                    logging.warning(f"Skipping patch for {key}: shape mismatch. Weight: {weight.shape}, Patch: {patch_shape}")
                    continue
            try:
                weight += (alpha * torch.mm(mat1.flatten(start_dim=1), mat2.flatten(start_dim=1))).reshape(weight.shape).type(weight.dtype)
            except Exception as e:
                logging.error(f"Failed to apply patch for {key}: {e}")
                continue
        return weight

    def unpatch_model(self, device_to=None, unpatch_weights=True):
        if unpatch_weights:
            for k in list(self.backup.keys()):
                util.set_attr_param(self.model, k, self.backup[k])
            self.backup.clear()
            if device_to:
                self.model.to(device_to)
                self.current_device = device_to
        self.object_patches_backup.clear()

    def partially_load(self, device_to, extra_memory=0):
        self.unpatch_model(unpatch_weights=False)
        self.patch_model(patch_weights=False)
        if not self.model.model_lowvram:
            return 0
        full_load = self.model.model_loaded_weight_memory + extra_memory > self.model_size()
        current_used = self.model.model_loaded_weight_memory
        self.load(device_to, lowvram_model_memory=current_used + extra_memory, full_load=full_load)
        return self.model.model_loaded_weight_memory - current_used

    def add_object_patch(self, name, obj):
        self.object_patches[name] = obj
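
    # Token Merging (tomesd) speeds up U-Net attention by merging similar tokens before
    # self-attention; `ratio` is the fraction of tokens merged and `max_downsample`
    # limits which U-Net resolutions get patched.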
    def apply_tome(self, ratio=0.5, max_downsample=1):
        if not TOMESD_AVAILABLE:
            logging.warning("Token Merging (tomesd) not available")
            return False
        try:
            tomesd.remove_patch(self)
        except Exception:
            pass
        self.tome_enabled, self.tome_ratio = False, 0.5
        try:
            if hasattr(self.model, 'diffusion_model'):
                tomesd.apply_patch(self, ratio=ratio, max_downsample=max_downsample)
                self.tome_enabled, self.tome_ratio = True, ratio
                logging.info(f"Applied Token Merging with ratio={ratio}, max_downsample={max_downsample}")
                return True
            return False
        except Exception as e:
            logging.error(f"Failed to apply Token Merging: {e}")
            return False

    def remove_tome(self):
        if not TOMESD_AVAILABLE or not self.tome_enabled:
            return False
        try:
            tomesd.remove_patch(self)
            self.tome_enabled, self.tome_ratio, self.tome_info = False, 0.5, {}
            return True
        except Exception as e:
            logging.error(f"Failed to remove Token Merging: {e}")
            return False


def unet_prefix_from_state_dict(state_dict):
    counts = {c: sum(1 for k in state_dict if k.startswith(c)) for c in ["model.diffusion_model.", "model.model."]}
    top = max(counts, key=counts.get)
    return top if counts[top] > 5 else "model."


def load_diffusion_model_state_dict(sd, model_options={}):
    dtype = model_options.get("dtype")
    prefix = unet_prefix_from_state_dict(sd)
    temp_sd = util.state_dict_prefix_replace(sd, {prefix: ""}, filter_keys=True)
    if len(temp_sd) > 0:
        sd = temp_sd
    parameters, load_device = util.calculate_parameters(sd), Device.get_torch_device()
    model_config = unet.model_config_from_unet(sd, "")
    offload_device = Device.unet_offload_device()
    unet_dtype2 = dtype or Device.unet_dtype(model_params=parameters, supported_dtypes=model_config.supported_inference_dtypes)
    manual_cast_dtype = Device.unet_manual_cast(unet_dtype2, load_device, model_config.supported_inference_dtypes)
    model_config.set_inference_dtype(unet_dtype2, manual_cast_dtype)
    model_config.custom_operations = model_options.get("custom_operations", model_config.custom_operations)
    model = model_config.get_model(sd, "").to(offload_device)
    model.load_model_weights(sd, "")
    return ModelPatcher(model, load_device=load_device, offload_device=offload_device)
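

# Usage sketch (assumes `sd` is a diffusion-model state dict loaded elsewhere, e.g. from a
# .safetensors file; the model_options keys shown are the ones read above):
#   patcher = load_diffusion_model_state_dict(sd, model_options={"dtype": torch.float16})
#   model = patcher.patch_model(device_to=patcher.load_device)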