RealESRGAN

Paused

App Files Files Community

Fabrice-TIERCELIN committed on Jul 13, 2025

Commit

1480dd1

verified ·

1 Parent(s): 957014a

Delete utils

Browse files

Files changed (2) hide show

utils/fp8_optimization_utils.py +0 -277
utils/lora_utils.py +0 -234

utils/fp8_optimization_utils.py DELETED Viewed

@@ -1,277 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-from tqdm import tqdm
-def calculate_fp8_maxval(exp_bits=4, mantissa_bits=3, sign_bits=1):
-    """
-    Calculate the maximum finite value representable in an 8-bit float format.
-    Default is E4M3 format (4-bit exponent, 3-bit mantissa, 1-bit sign).
-    Two encodings are distinguished:
-    * E5M2 uses the IEEE-style layout of torch.float8_e5m2: the all-ones exponent is
-      reserved for inf/NaN, so the largest finite value is 1.75 * 2**15 = 57344.
-    * Every other split uses the "fn" layout of torch.float8_e4m3fn: the all-ones
-      exponent still encodes finite numbers and only the all-ones mantissa is given
-      up, so E4M3 yields 1.75 * 2**8 = 448.
-    Args:
-        exp_bits (int): Number of exponent bits
-        mantissa_bits (int): Number of mantissa bits
-        sign_bits (int): Number of sign bits (0 or 1)
-    Returns:
-        float: Maximum finite value representable in the format
-    Raises:
-        AssertionError: If the three bit counts do not add up to 8
-    """
-    assert exp_bits + mantissa_bits + sign_bits == 8, "Total bits must be 8"
-    # Exponent bias
-    bias = 2 ** (exp_bits - 1) - 1
-    if exp_bits == 5 and mantissa_bits == 2:
-        # IEEE-style: every mantissa bit may be set, but the top exponent code is
-        # unavailable. The previous formula returned 98304 here, which overflows
-        # to inf when cast to torch.float8_e5m2.
-        mantissa_max = 2.0 - 2.0 ** -mantissa_bits
-        max_exponent = 2**exp_bits - 2 - bias
-    else:
-        # "fn"-style: the top exponent code is usable, the last mantissa bit is not
-        mantissa_max = 1.0 + sum(2.0 ** -(i + 1) for i in range(mantissa_bits - 1))
-        max_exponent = 2**exp_bits - 1 - bias
-    return mantissa_max * (2**max_exponent)
-def quantize_tensor_to_fp8(tensor, scale, exp_bits=4, mantissa_bits=3, sign_bits=1, max_value=None, min_value=None):
-    """
-    Simulate FP8 quantization of a tensor: scale, clamp, then round to the FP8 grid.
-    The returned tensor stays in the scaled domain (it is not multiplied back by
-    `scale`) and is not cast to an FP8 dtype here; the caller does both.
-    Args:
-        tensor (torch.Tensor): Tensor to quantize
-        scale (float or torch.Tensor): Scale factor the tensor is divided by
-        exp_bits (int): Number of exponent bits
-        mantissa_bits (int): Number of mantissa bits
-        sign_bits (int): Number of sign bits
-        max_value (float, optional): Upper clamp bound. Defaults to the format's maximum value
-        min_value (float, optional): Lower clamp bound. Defaults to -max_value for signed
-            formats and 0.0 for unsigned ones
-    Returns:
-        tuple: (quantized_tensor, scale_factor), where scale_factor is `scale` unchanged
-    """
-    # Create scaled tensor
-    scaled_tensor = tensor / scale
-    # Calculate FP8 parameters
-    bias = 2 ** (exp_bits - 1) - 1
-    # Resolve each bound on its own: previously a caller-supplied min_value was
-    # overwritten whenever max_value was None, and passing only max_value left
-    # the lower side unclamped.
-    if max_value is None:
-        max_value = calculate_fp8_maxval(exp_bits, mantissa_bits, sign_bits)
-    if min_value is None:
-        min_value = -max_value if sign_bits > 0 else 0.0
-    # Clamp tensor to range
-    clamped_tensor = torch.clamp(scaled_tensor, min_value, max_value)
-    # Per-element biased exponent. Zeros give log2(0) = -inf, which the clamp below
-    # raises to 1.0, the same value the masked assignment used to leave for them.
-    abs_values = torch.abs(clamped_tensor)
-    log_scales = torch.floor(torch.log2(abs_values) + bias).detach()
-    # Limit log scales and calculate quantization factor (spacing of the FP8 grid)
-    log_scales = torch.clamp(log_scales, min=1.0)
-    quant_factor = 2.0 ** (log_scales - mantissa_bits - bias)
-    # Quantize and dequantize
-    quantized = torch.round(clamped_tensor / quant_factor) * quant_factor
-    return quantized, scale
-def optimize_state_dict_with_fp8(
-    state_dict, calc_device, target_layer_keys=None, exclude_layer_keys=None, exp_bits=4, mantissa_bits=3, move_to_device=False
-):
-    """
-    Quantize the selected weight tensors of a state dict to FP8, in place.
-    Every tensor whose key ends with ".weight", matches `target_layer_keys` (when given)
-    and matches none of `exclude_layer_keys` is replaced by its FP8 version, and a
-    companion "<module>.scale_weight" entry holding the per-tensor scale is added.
-    Selection is by key name only; the module type is not checked here, so use the
-    key patterns to keep non-Linear weights out.
-    Args:
-        state_dict (dict): State dict to optimize, replaced in-place
-        calc_device (str): Device to quantize tensors on (None keeps each tensor where it is)
-        target_layer_keys (list, optional): Layer key patterns to target (None for all ".weight" keys)
-        exclude_layer_keys (list, optional): Layer key patterns to exclude
-        exp_bits (int): Number of exponent bits
-        mantissa_bits (int): Number of mantissa bits
-        move_to_device (bool): Move optimized tensors to the calculating device
-    Returns:
-        dict: FP8 optimized state dict (the same object as `state_dict`)
-    Raises:
-        ValueError: If the format is neither E4M3 nor E5M2
-    """
-    if exp_bits == 4 and mantissa_bits == 3:
-        fp8_dtype = torch.float8_e4m3fn
-    elif exp_bits == 5 and mantissa_bits == 2:
-        fp8_dtype = torch.float8_e5m2
-    else:
-        raise ValueError(f"Unsupported FP8 format: E{exp_bits}M{mantissa_bits}")
-    # Take the limit from the dtype itself so scaled weights can never exceed what
-    # the final cast can hold (448 for E4M3FN, 57344 for E5M2).
-    max_value = torch.finfo(fp8_dtype).max
-    min_value = -max_value  # this function supports only signed FP8
-    weight_suffix = ".weight"
-    # Enumerate target keys up front so the dict can be modified while processing
-    target_state_dict_keys = []
-    for key, value in state_dict.items():
-        if not key.endswith(weight_suffix) or not isinstance(value, torch.Tensor):
-            continue
-        if target_layer_keys is not None and not any(pattern in key for pattern in target_layer_keys):
-            continue
-        if exclude_layer_keys is not None and any(pattern in key for pattern in exclude_layer_keys):
-            continue
-        target_state_dict_keys.append(key)
-    # Only a CUDA calculation device has a CUDA cache worth releasing
-    clear_cuda_cache = calc_device is not None and torch.device(calc_device).type == "cuda"
-    optimized_count = 0
-    # Process each key
-    for key in tqdm(target_state_dict_keys):
-        value = state_dict[key]
-        # Save original device and dtype
-        original_device = value.device
-        original_dtype = value.dtype
-        # Move to calculation device
-        if calc_device is not None:
-            value = value.to(calc_device)
-        # Per-tensor scale mapping the largest magnitude onto the FP8 maximum
-        scale = torch.max(torch.abs(value.flatten())) / max_value
-        # An all-zero weight would give scale 0 and turn every element into 0/0 = NaN
-        scale = torch.where(scale == 0, torch.ones_like(scale), scale)
-        # Quantize weight to FP8
-        quantized_weight, _ = quantize_tensor_to_fp8(value, scale, exp_bits, mantissa_bits, 1, max_value, min_value)
-        # Keep the original key for the weight; derive the scale key by swapping only
-        # the trailing ".weight" (str.replace would also rewrite earlier occurrences)
-        scale_key = key[: -len(weight_suffix)] + ".scale_weight"
-        quantized_weight = quantized_weight.to(fp8_dtype)
-        if not move_to_device:
-            quantized_weight = quantized_weight.to(original_device)
-        scale_tensor = torch.tensor([scale], dtype=original_dtype, device=quantized_weight.device)
-        state_dict[key] = quantized_weight
-        state_dict[scale_key] = scale_tensor
-        optimized_count += 1
-        if clear_cuda_cache:
-            # free memory on calculation device
-            torch.cuda.empty_cache()
-    print(f"Number of optimized Linear layers: {optimized_count}")
-    return state_dict
-def fp8_linear_forward_patch(self: nn.Linear, x, use_scaled_mm=False, max_value=None):
-    """
-    Forward pass for a Linear layer whose weight is stored in FP8 with a `scale_weight` buffer.
-    Args:
-        self: Linear layer instance carrying `weight`, `scale_weight` and optionally `bias`
-        x (torch.Tensor): Input tensor
-        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
-        max_value (float): Maximum value for FP8 quantization of the input. If None, the input is not quantized.
-    Returns:
-        torch.Tensor: Result of linear transformation
-    """
-    if not use_scaled_mm:
-        # Fallback path: rebuild the weight in the dtype of the scale, then do a regular linear
-        compute_dtype = self.scale_weight.dtype
-        restored_weight = self.weight.to(compute_dtype) * self.scale_weight
-        return F.linear(x, restored_weight, self.bias)
-    in_dtype = x.dtype
-    scale_dtype = self.scale_weight.dtype
-    assert self.weight.dtype == torch.float8_e4m3fn, "Only FP8 E4M3FN format is supported"
-    assert x.ndim == 3, "Input tensor must be 3D (batch_size, seq_len, hidden_dim)"
-    if max_value is not None:
-        # scale the input by its largest magnitude, then quantize it as E5M2
-        # (noted in the original as memory hungry)
-        scale_x = (torch.max(torch.abs(x.flatten())) / max_value).to(torch.float32)
-        x, _ = quantize_tensor_to_fp8(x, scale_x, 5, 2, 1, max_value, -max_value)
-    else:
-        # input is passed through unquantized, so its scale is one
-        scale_x = torch.tensor(1.0, dtype=torch.float32, device=x.device)
-    batch_size, seq_len = x.shape[0], x.shape[1]
-    # scaled_mm works on 2D operands: fold batch and sequence together
-    x_2d = x.reshape(-1, x.shape[2]).to(torch.float8_e5m2)
-    # float32 is not supported with bias in scaled_mm, so the output takes the
-    # scale's dtype when a bias is present and the input's dtype otherwise
-    out_dtype = scale_dtype if self.bias is not None else in_dtype
-    product = torch._scaled_mm(
-        x_2d,
-        self.weight.t(),
-        out_dtype=out_dtype,
-        bias=self.bias,
-        scale_a=scale_x,
-        scale_b=self.scale_weight.to(torch.float32),
-    )
-    return product.reshape(batch_size, seq_len, -1).to(in_dtype)
-def apply_fp8_monkey_patch(model, optimized_state_dict, use_scaled_mm=False):
-    """
-    Replace the forward of every Linear layer that has an FP8 scale in the state dict.
-    Args:
-        model (nn.Module): Model instance to patch
-        optimized_state_dict (dict): FP8 optimized state dict
-        use_scaled_mm (bool): Use scaled_mm for FP8 Linear layers, requires SM 8.9+ (RTX 40 series)
-    Returns:
-        nn.Module: The patched model (same instance, modified in-place)
-    """
-    # Input tensors are not quantized; an E5M2 maximum could be supplied here to enable that
-    max_value = None
-    scale_suffix = ".scale_weight"
-    # Module paths that own a scale entry, i.e. the FP8-optimized layers
-    fp8_module_paths = {
-        state_key.rsplit(scale_suffix, 1)[0] for state_key in optimized_state_dict.keys() if state_key.endswith(scale_suffix)
-    }
-    # One replacement forward, bound to each patched module below
-    def new_forward(self, x):
-        return fp8_linear_forward_patch(self, x, use_scaled_mm, max_value)
-    patched_count = 0
-    for name, module in model.named_modules():
-        # Only Linear layers with a matching scale are touched
-        if name not in fp8_module_paths or not isinstance(module, nn.Linear):
-            continue
-        # placeholder buffer so the real scale_weight can be loaded from the state dict
-        module.register_buffer("scale_weight", torch.tensor(1.0, dtype=module.weight.dtype))
-        # bind as an instance method so `self` is this module
-        module.forward = new_forward.__get__(module, type(module))
-        patched_count += 1
-    print(f"Number of monkey-patched Linear layers: {patched_count}")
-    return model

utils/lora_utils.py DELETED Viewed

@@ -1,234 +0,0 @@
-import os
-import torch
-from safetensors.torch import load_file
-from tqdm import tqdm
-def merge_lora_to_state_dict(
-    state_dict: dict[str, torch.Tensor], lora_file: str, multiplier: float, device: torch.device
-) -> dict[str, torch.Tensor]:
-    """
-    Load a LoRA safetensors file, detect its key format and merge it into a model state dict.
-    Two formats are recognized: keys starting with "lora_unet_" (Musubi Tuner), and keys
-    containing "lora_A" whose first dotted segment is "diffusion_model" or "transformer".
-    Anything else, including an empty file, is reported and `state_dict` is returned unchanged.
-    Args:
-        state_dict: Model state dict to merge into
-        lora_file: Path of the LoRA file
-        multiplier: Strength applied to the LoRA delta
-        device: Device the merge arithmetic runs on
-    Returns:
-        The state dict with the LoRA merged in
-    """
-    lora_sd = load_file(lora_file)
-    # Check the format of the LoRA file
-    keys = list(lora_sd.keys())
-    # Guard against an empty file: there is no first key to inspect
-    if keys and keys[0].startswith("lora_unet_"):
-        print("Musubi Tuner LoRA detected")
-        return merge_musubi_tuner(lora_sd, state_dict, multiplier, device)
-    transformer_prefixes = ("diffusion_model", "transformer")  # to ignore Text Encoder modules
-    has_lora_a = any("lora_A" in key for key in keys)
-    has_transformer_prefix = any(key.split(".")[0] in transformer_prefixes for key in keys)
-    if has_lora_a and has_transformer_prefix:
-        print("Diffusion-pipe (?) LoRA detected")
-        return merge_diffusion_pipe_or_something(lora_sd, state_dict, "lora_unet_", multiplier, device)
-    print(f"LoRA file format not recognized: {os.path.basename(lora_file)}")
-    return state_dict
-def merge_diffusion_pipe_or_something(
-    lora_sd: dict[str, torch.Tensor], state_dict: dict[str, torch.Tensor], prefix: str, multiplier: float, device: torch.device
-) -> dict[str, torch.Tensor]:
-    """
-    Convert a lora_A/lora_B style LoRA to the Musubi Tuner key layout, then merge it.
-    Copy from Musubi Tuner repo.
-    Input keys look like "diffusion_model.module.name.lora_A.weight" (or with a "transformer"
-    prefix); output keys look like "<prefix>module_name.lora_down.weight". The source format
-    carries no alpha, so an alpha equal to the rank is added for every module.
-    Args:
-        lora_sd: LoRA weights in the lora_A/lora_B layout
-        state_dict: Model state dict to merge into
-        prefix: Prefix put in front of every converted module name
-        multiplier: Strength applied to the LoRA delta
-        device: Device the merge arithmetic runs on
-    Returns:
-        The state dict with the LoRA merged in
-    """
-    new_weights_sd = {}
-    lora_dims = {}
-    for key, weight in lora_sd.items():
-        # partition never raises: a key without a dot used to crash the tuple unpacking
-        # of split(".", 1) instead of being reported as unexpected
-        diffusers_prefix, _, key_body = key.partition(".")
-        if diffusers_prefix not in ("diffusion_model", "transformer") or not key_body:
-            print(f"unexpected key: {key} in diffusers format")
-            continue
-        new_key = f"{prefix}{key_body}".replace(".", "_").replace("_lora_A_", ".lora_down.").replace("_lora_B_", ".lora_up.")
-        new_weights_sd[new_key] = weight
-        lora_name = new_key.split(".")[0]  # before first dot
-        if "lora_down" in new_key:
-            # the rank is the first dimension of the down weight; keep the first one seen
-            lora_dims.setdefault(lora_name, weight.shape[0])
-    # add alpha with rank
-    for lora_name, dim in lora_dims.items():
-        new_weights_sd[f"{lora_name}.alpha"] = torch.tensor(dim)
-    return merge_musubi_tuner(new_weights_sd, state_dict, multiplier, device)
-def merge_musubi_tuner(
-    lora_sd: dict[str, torch.Tensor], state_dict: dict[str, torch.Tensor], multiplier: float, device: torch.device
-) -> dict[str, torch.Tensor]:
-    """
-    Merge Musubi Tuner format LoRA weights into the state dict of a model, in place.
-    For every "<module>.lora_down.weight" entry whose module maps to a ".weight" key of
-    `state_dict`, the weight becomes W + multiplier * (up x down) * (alpha / rank).
-    A LoRA containing "double_blocks" or "single_blocks" keys is first converted from
-    HunyuanVideo to FramePack naming.
-    Args:
-        lora_sd: LoRA weights ("lora_unet_<module>.lora_down/lora_up.weight" and optional ".alpha")
-        state_dict: Model state dict to merge into
-        multiplier: Strength applied to the LoRA delta
-        device: Device the merge arithmetic runs on
-    Returns:
-        The same state dict, with merged weights kept on their original device and dtype
-    """
-    # Check LoRA is for FramePack or for HunyuanVideo
-    is_hunyuan = any("double_blocks" in key or "single_blocks" in key for key in lora_sd.keys())
-    if is_hunyuan:
-        print("HunyuanVideo LoRA detected, converting to FramePack format")
-        lora_sd = convert_hunyuan_to_framepack(lora_sd)
-    # Merge LoRA weights into the state dict
-    print(f"Merging LoRA weights into state dict. multiplier: {multiplier}")
-    # Map each LoRA module name to the state dict key it modifies (first match wins)
-    name_to_original_key = {}
-    for key in state_dict.keys():
-        if key.endswith(".weight"):
-            lora_name = key.rsplit(".", 1)[0]  # remove trailing ".weight"
-            lora_name = "lora_unet_" + lora_name.replace(".", "_")
-            name_to_original_key.setdefault(lora_name, key)
-    # Merge LoRA weights
-    keys = [k for k in lora_sd.keys() if "lora_down" in k]
-    for key in tqdm(keys, desc="Merging LoRA weights"):
-        up_key = key.replace("lora_down", "lora_up")
-        alpha_key = key[: key.index("lora_down")] + "alpha"
-        # find original key for this lora
-        module_name = ".".join(key.split(".")[:-2])  # remove trailing ".lora_down.weight"
-        if module_name not in name_to_original_key:
-            print(f"No module found for LoRA weight: {key}")
-            continue
-        original_key = name_to_original_key[module_name]
-        down_weight = lora_sd[key]
-        up_weight = lora_sd[up_key]
-        # rank is the first dimension of the down weight; alpha defaults to the rank (scale 1)
-        dim = down_weight.size()[0]
-        alpha = lora_sd.get(alpha_key, dim)
-        scale = alpha / dim
-        weight = state_dict[original_key]
-        original_device = weight.device
-        original_dtype = weight.dtype
-        if original_device != device:
-            weight = weight.to(device)  # to make calculation faster
-        down_weight = down_weight.to(device)
-        up_weight = up_weight.to(device)
-        # W <- W + U * D
-        if len(weight.size()) == 2:
-            # linear
-            if len(up_weight.size()) == 4:  # use linear projection mismatch
-                up_weight = up_weight.squeeze(3).squeeze(2)
-                down_weight = down_weight.squeeze(3).squeeze(2)
-            weight = weight + multiplier * (up_weight @ down_weight) * scale
-        elif down_weight.size()[2:4] == (1, 1):
-            # conv2d 1x1
-            weight = (
-                weight
-                + multiplier
-                * (up_weight.squeeze(3).squeeze(2) @ down_weight.squeeze(3).squeeze(2)).unsqueeze(2).unsqueeze(3)
-                * scale
-            )
-        else:
-            # conv2d 3x3
-            conved = torch.nn.functional.conv2d(down_weight.permute(1, 0, 2, 3), up_weight).permute(1, 0, 2, 3)
-            weight = weight + multiplier * conved * scale
-        # Move back to the original device AND dtype: adding a higher-precision LoRA
-        # delta promotes the sum (e.g. bf16 + fp32 -> fp32), which would otherwise
-        # silently change the dtype of the merged entries.
-        state_dict[original_key] = weight.to(device=original_device, dtype=original_dtype)
-    return state_dict
-def convert_hunyuan_to_framepack(lora_sd: dict[str, torch.Tensor]) -> dict[str, torch.Tensor]:
-    """
-    Rename HunyuanVideo LoRA keys to FramePack names and split fused QKV / QKVM projections.
-    Keys outside "double_blocks" / "single_blocks" are reported and dropped. For a fused
-    projection the down weight and alpha are shared by every split target, while the up
-    weight is cut along dim 0 in chunks of 3072 (the remainder goes to the MLP part of QKVM).
-    """
-    # Ordered (old, new) substring renames, applied one after another
-    double_block_renames = (
-        ("double_blocks", "transformer_blocks"),
-        ("img_mod_linear", "norm1_linear"),
-        ("img_attn_qkv", "attn_to_QKV"),  # split later
-        ("img_attn_proj", "attn_to_out_0"),
-        ("img_mlp_fc1", "ff_net_0_proj"),
-        ("img_mlp_fc2", "ff_net_2"),
-        ("txt_mod_linear", "norm1_context_linear"),
-        ("txt_attn_qkv", "attn_add_QKV_proj"),  # split later
-        ("txt_attn_proj", "attn_to_add_out"),
-        ("txt_mlp_fc1", "ff_context_net_0_proj"),
-        ("txt_mlp_fc2", "ff_context_net_2"),
-    )
-    single_block_renames = (
-        ("single_blocks", "single_transformer_blocks"),
-        ("linear1", "attn_to_QKVM"),  # split later
-        ("linear2", "proj_out"),
-        ("modulation_linear", "norm_linear"),
-    )
-    converted = {}
-    for name, tensor in lora_sd.items():
-        if "double_blocks" in name:
-            renames = double_block_renames
-        elif "single_blocks" in name:
-            renames = single_block_renames
-        else:
-            print(f"Unsupported module name: {name}, only double_blocks and single_blocks are supported")
-            continue
-        for old, new in renames:
-            name = name.replace(old, new)
-        with_mlp = "QKVM" in name
-        if not with_mlp and "QKV" not in name:
-            # no split needed
-            converted[name] = tensor
-            continue
-        tag = "QKVM" if with_mlp else "QKV"
-        # Target keys in insertion order: q, k, v and, for QKVM, the MLP projection
-        targets = [name.replace(tag, part) for part in ("q", "k", "v")]
-        if with_mlp:
-            targets.append(name.replace("attn_to_QKVM", "proj_mlp"))
-        is_alpha = "alpha" in name
-        if "_down" in name or is_alpha:
-            # the down weight / alpha is shared as-is by every target
-            assert is_alpha or tensor.size(1) == 3072, f"{tag} weight size mismatch: {name}. {tensor.size()}"
-            for target in targets:
-                converted[target] = tensor
-        elif "_up" in name:
-            # the up weight is cut row-wise; the last slice is open-ended
-            if with_mlp:
-                assert tensor.size(0) == 21504, f"{tag} weight size mismatch: {name}. {tensor.size()}"
-                bounds = (slice(None, 3072), slice(3072, 3072 * 2), slice(3072 * 2, 3072 * 3), slice(3072 * 3, None))
-            else:
-                assert tensor.size(0) == 3072 * 3, f"{tag} weight size mismatch: {name}. {tensor.size()}"
-                bounds = (slice(None, 3072), slice(3072, 3072 * 2), slice(3072 * 2, None))
-            for target, rows in zip(targets, bounds):
-                converted[target] = tensor[rows]
-        else:
-            print(f"Unsupported module name: {name}")
-            continue
-    return converted