Spaces:

iimmortall
/

InstantRetouch

Running on Zero

App Files Files Community

InstantRetouch / vendor /diffusers /utils /state_dict_utils.py

iimmortall

Deploy InstantRetouch BILA ZeroGPU Space

bc275c2 verified 2 days ago

raw

history blame contribute delete

23.1 kB

	# Copyright 2025 The HuggingFace Team. All rights reserved.
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	"""
	State dict utilities: utility methods for converting state dicts easily
	"""

	import enum
	import json

	from .import_utils import is_torch_available
	from .logging import get_logger


	# torch is an optional dependency: it is imported only when `is_torch_available()` reports it as installed.
	# `state_dict_all_zero` further down relies on this module-level `torch` name.
	if is_torch_available():
	import torch


	# Module-level logger, named after this module.
	logger = get_logger(__name__)


	class StateDictType(enum.Enum):
	    """
	    Enumerates the state dict key layouts that the conversion helpers in this module translate between.
	    """

	    # Legacy diffusers LoRA layout (keys such as `to_q_lora` / `to_out_lora`).
	    DIFFUSERS_OLD = "diffusers_old"
	    # Kohya-ss layout (keys using `lora_down` / `lora_up`).
	    KOHYA_SS = "kohya_ss"
	    # PEFT layout (keys using `lora_A` / `lora_B`).
	    PEFT = "peft"
	    # Current diffusers layout (keys using `lora_linear_layer`).
	    DIFFUSERS = "diffusers"


	# We need to define a proper mapping for Unet since it uses different output keys than text encoder
	# e.g. to_q_lora -> q_proj / to_q
	#
	# Substring -> replacement table used by `convert_unet_state_dict_to_peft`. `convert_state_dict` applies only the
	# first pattern (in insertion order) that occurs in a key, so the order of the entries below is significant.
	UNET_TO_DIFFUSERS = {
	".to_out_lora.up": ".to_out.0.lora_B",
	".to_out_lora.down": ".to_out.0.lora_A",
	".to_q_lora.down": ".to_q.lora_A",
	".to_q_lora.up": ".to_q.lora_B",
	".to_k_lora.down": ".to_k.lora_A",
	".to_k_lora.up": ".to_k.lora_B",
	".to_v_lora.down": ".to_v.lora_A",
	".to_v_lora.up": ".to_v.lora_B",
	# Generic `.lora.up` / `.lora.down` suffixes for layers not covered above.
	".lora.up": ".lora_B",
	".lora.down": ".lora_A",
	# The magnitude vector of `to_out` is re-homed under the `.0` sub-module, like the LoRA weights above.
	".to_out.lora_magnitude_vector": ".to_out.0.lora_magnitude_vector",
	}

	# Substring -> replacement table applied as the last step of `convert_sai_sd_control_lora_state_dict_to_peft`.
	# Every entry rewrites a `<layer>.down` / `<layer>.up` suffix into `<layer>.lora_A.weight` / `<layer>.lora_B.weight`.
	# `convert_state_dict` applies only the first matching pattern per key, in insertion order.
	CONTROL_LORA_TO_DIFFUSERS = {
	# Attention projections
	".to_q.down": ".to_q.lora_A.weight",
	".to_q.up": ".to_q.lora_B.weight",
	".to_k.down": ".to_k.lora_A.weight",
	".to_k.up": ".to_k.lora_B.weight",
	".to_v.down": ".to_v.lora_A.weight",
	".to_v.up": ".to_v.lora_B.weight",
	".to_out.0.down": ".to_out.0.lora_A.weight",
	".to_out.0.up": ".to_out.0.lora_B.weight",
	# Feed-forward layers
	".ff.net.0.proj.down": ".ff.net.0.proj.lora_A.weight",
	".ff.net.0.proj.up": ".ff.net.0.proj.lora_B.weight",
	".ff.net.2.down": ".ff.net.2.lora_A.weight",
	".ff.net.2.up": ".ff.net.2.lora_B.weight",
	# Input/output projections
	".proj_in.down": ".proj_in.lora_A.weight",
	".proj_in.up": ".proj_in.lora_B.weight",
	".proj_out.down": ".proj_out.lora_A.weight",
	".proj_out.up": ".proj_out.lora_B.weight",
	# Convolutions; `range(1, 3)` expands to `.conv1` and `.conv2`
	".conv.down": ".conv.lora_A.weight",
	".conv.up": ".conv.lora_B.weight",
	**{f".conv{i}.down": f".conv{i}.lora_A.weight" for i in range(1, 3)},
	**{f".conv{i}.up": f".conv{i}.lora_B.weight" for i in range(1, 3)},
	# No leading dot here, so the pattern also matches when `conv_in` starts the key.
	"conv_in.down": "conv_in.lora_A.weight",
	"conv_in.up": "conv_in.lora_B.weight",
	".conv_shortcut.down": ".conv_shortcut.lora_A.weight",
	".conv_shortcut.up": ".conv_shortcut.lora_B.weight",
	# Linear layers; `range(1, 3)` expands to `.linear_1` and `.linear_2`
	**{f".linear_{i}.down": f".linear_{i}.lora_A.weight" for i in range(1, 3)},
	**{f".linear_{i}.up": f".linear_{i}.lora_B.weight" for i in range(1, 3)},
	# No leading dot here either (the pattern may follow any character).
	"time_emb_proj.down": "time_emb_proj.lora_A.weight",
	"time_emb_proj.up": "time_emb_proj.lora_B.weight",
	}

	# Substring -> replacement table for `StateDictType.DIFFUSERS` -> PEFT (`lora_linear_layer.up/down` -> `lora_B/lora_A`).
	# Order matters: `convert_state_dict` stops at the first matching pattern, and the projection-specific patterns
	# contain the generic `.lora_linear_layer.*` patterns as substrings, so the specific ones are listed first.
	DIFFUSERS_TO_PEFT = {
	".q_proj.lora_linear_layer.up": ".q_proj.lora_B",
	".q_proj.lora_linear_layer.down": ".q_proj.lora_A",
	".k_proj.lora_linear_layer.up": ".k_proj.lora_B",
	".k_proj.lora_linear_layer.down": ".k_proj.lora_A",
	".v_proj.lora_linear_layer.up": ".v_proj.lora_B",
	".v_proj.lora_linear_layer.down": ".v_proj.lora_A",
	".out_proj.lora_linear_layer.up": ".out_proj.lora_B",
	".out_proj.lora_linear_layer.down": ".out_proj.lora_A",
	# Generic fallback for any other layer
	".lora_linear_layer.up": ".lora_B",
	".lora_linear_layer.down": ".lora_A",
	# `text_projection` keys use a `lora.down/up.weight` suffix instead of `lora_linear_layer`
	"text_projection.lora.down.weight": "text_projection.lora_A.weight",
	"text_projection.lora.up.weight": "text_projection.lora_B.weight",
	}

	# Substring -> replacement table for `StateDictType.DIFFUSERS_OLD` -> PEFT: `to_{q,k,v,out}_lora.up/down` become
	# `{q,k,v,out}_proj.lora_B/lora_A`. `convert_state_dict` applies only the first matching pattern per key.
	DIFFUSERS_OLD_TO_PEFT = {
	".to_q_lora.up": ".q_proj.lora_B",
	".to_q_lora.down": ".q_proj.lora_A",
	".to_k_lora.up": ".k_proj.lora_B",
	".to_k_lora.down": ".k_proj.lora_A",
	".to_v_lora.up": ".v_proj.lora_B",
	".to_v_lora.down": ".v_proj.lora_A",
	".to_out_lora.up": ".out_proj.lora_B",
	".to_out_lora.down": ".out_proj.lora_A",
	# Generic fallback for keys that already use `lora_linear_layer`
	".lora_linear_layer.up": ".lora_B",
	".lora_linear_layer.down": ".lora_A",
	}

	# Substring -> replacement table for `StateDictType.PEFT` -> diffusers.
	# `*_proj` keys map `lora_B/lora_A` to `lora_linear_layer.up/down`; `to_*` keys map them to `lora.up/down`.
	# `convert_state_dict` applies only the first matching pattern per key.
	PEFT_TO_DIFFUSERS = {
	".q_proj.lora_B": ".q_proj.lora_linear_layer.up",
	".q_proj.lora_A": ".q_proj.lora_linear_layer.down",
	".k_proj.lora_B": ".k_proj.lora_linear_layer.up",
	".k_proj.lora_A": ".k_proj.lora_linear_layer.down",
	".v_proj.lora_B": ".v_proj.lora_linear_layer.up",
	".v_proj.lora_A": ".v_proj.lora_linear_layer.down",
	".out_proj.lora_B": ".out_proj.lora_linear_layer.up",
	".out_proj.lora_A": ".out_proj.lora_linear_layer.down",
	# `to_*` patterns have no leading dot
	"to_k.lora_A": "to_k.lora.down",
	"to_k.lora_B": "to_k.lora.up",
	"to_q.lora_A": "to_q.lora.down",
	"to_q.lora_B": "to_q.lora.up",
	"to_v.lora_A": "to_v.lora.down",
	"to_v.lora_B": "to_v.lora.up",
	"to_out.0.lora_A": "to_out.0.lora.down",
	"to_out.0.lora_B": "to_out.0.lora.up",
	}

	# Substring -> replacement table for `StateDictType.DIFFUSERS_OLD` -> current diffusers layout:
	# `to_{q,k,v,out}_lora.up/down` become `{q,k,v,out}_proj.lora_linear_layer.up/down`.
	# `convert_state_dict` applies only the first matching pattern per key.
	DIFFUSERS_OLD_TO_DIFFUSERS = {
	".to_q_lora.up": ".q_proj.lora_linear_layer.up",
	".to_q_lora.down": ".q_proj.lora_linear_layer.down",
	".to_k_lora.up": ".k_proj.lora_linear_layer.up",
	".to_k_lora.down": ".k_proj.lora_linear_layer.down",
	".to_v_lora.up": ".v_proj.lora_linear_layer.up",
	".to_v_lora.down": ".v_proj.lora_linear_layer.down",
	".to_out_lora.up": ".out_proj.lora_linear_layer.up",
	".to_out_lora.down": ".out_proj.lora_linear_layer.down",
	# Magnitude vectors follow the same `to_*` -> `*_proj` renaming
	".to_k.lora_magnitude_vector": ".k_proj.lora_magnitude_vector",
	".to_v.lora_magnitude_vector": ".v_proj.lora_magnitude_vector",
	".to_q.lora_magnitude_vector": ".q_proj.lora_magnitude_vector",
	".to_out.lora_magnitude_vector": ".out_proj.lora_magnitude_vector",
	}

	# Substring -> replacement table for the first step of the PEFT -> Kohya conversion
	# (`lora_A` -> `lora_down`, `lora_B` -> `lora_up`).
	PEFT_TO_KOHYA_SS = {
	"lora_A": "lora_down",
	"lora_B": "lora_up",
	# This is not a comprehensive dict as kohya format requires replacing `.` with `_` in keys,
	# adding prefixes and adding alpha values
	# Check `convert_state_dict_to_kohya` for more
	}

	# Source-format -> mapping table used by `convert_state_dict_to_peft`; formats missing here are rejected there.
	PEFT_STATE_DICT_MAPPINGS = {
	StateDictType.DIFFUSERS_OLD: DIFFUSERS_OLD_TO_PEFT,
	StateDictType.DIFFUSERS: DIFFUSERS_TO_PEFT,
	}

	# Source-format -> mapping table used by `convert_state_dict_to_diffusers`.
	DIFFUSERS_STATE_DICT_MAPPINGS = {
	StateDictType.DIFFUSERS_OLD: DIFFUSERS_OLD_TO_DIFFUSERS,
	StateDictType.PEFT: PEFT_TO_DIFFUSERS,
	}

	# Source-format -> mapping table used by `convert_state_dict_to_kohya` (PEFT is the only supported source).
	KOHYA_STATE_DICT_MAPPINGS = {StateDictType.PEFT: PEFT_TO_KOHYA_SS}

	# Replacements that `convert_state_dict` applies to every key before the format-specific mapping.
	# Unlike the mapping, all of these are applied (no early exit after the first match).
	KEYS_TO_ALWAYS_REPLACE = {
	".processor.": ".",
	}


	def convert_state_dict(state_dict, mapping):
	    r"""
	    Renames the keys of `state_dict` by substring substitution and returns the result as a new dict.

	    Each key first goes through every replacement in `KEYS_TO_ALWAYS_REPLACE`. After that, only the first pattern of
	    `mapping` (in iteration order) that occurs in the key is substituted. Values are passed through untouched.

	    Args:
	        state_dict (`dict[str, torch.Tensor]`):
	            The state dict to convert.
	        mapping (`dict[str, str]`):
	            The substitutions to use for conversion:
	                - key: the pattern to replace
	                - value: the pattern to replace with

	    Returns:
	        converted_state_dict (`dict`)
	            The converted state dict.
	    """
	    renamed = {}
	    for name, value in state_dict.items():
	        # Unconditional clean-ups: every matching pattern is applied.
	        for needle, replacement in KEYS_TO_ALWAYS_REPLACE.items():
	            if needle in name:
	                name = name.replace(needle, replacement)

	        # Format-specific rename: stop after the first pattern that matches.
	        for needle, replacement in mapping.items():
	            if needle in name:
	                name = name.replace(needle, replacement)
	                break

	        renamed[name] = value
	    return renamed


	def convert_state_dict_to_peft(state_dict, original_type=None, **kwargs):
	    r"""
	    Converts a state dict from the old diffusers format (`DIFFUSERS_OLD`) or the new diffusers format (`DIFFUSERS`)
	    to the PEFT format. No other source format is supported for now.

	    Args:
	        state_dict (`dict[str, torch.Tensor]`):
	            The state dict to convert.
	        original_type (`StateDictType`, optional):
	            The original type of the state dict. If not provided, it is inferred from the keys: any key containing
	            `to_out_lora` selects `DIFFUSERS_OLD`, otherwise any key containing `lora_linear_layer` selects
	            `DIFFUSERS`.

	    Raises:
	        ValueError: If the type cannot be inferred, or if `original_type` has no PEFT mapping.
	    """

	    def _any_key_contains(fragment):
	        return any(fragment in name for name in state_dict.keys())

	    if original_type is None:
	        # Old diffusers to PEFT
	        if _any_key_contains("to_out_lora"):
	            original_type = StateDictType.DIFFUSERS_OLD
	        elif _any_key_contains("lora_linear_layer"):
	            original_type = StateDictType.DIFFUSERS
	        else:
	            raise ValueError("Could not automatically infer state dict type")

	    if original_type not in PEFT_STATE_DICT_MAPPINGS:
	        raise ValueError(f"Original type {original_type} is not supported")

	    return convert_state_dict(state_dict, PEFT_STATE_DICT_MAPPINGS[original_type])


	def convert_state_dict_to_diffusers(state_dict, original_type=None, **kwargs):
	    r"""
	    Converts a state dict to the new diffusers format. The source can be the old diffusers format (`DIFFUSERS_OLD`)
	    or the PEFT format (`PEFT`). When the type is inferred and the keys already use `lora_linear_layer`, the state
	    dict is returned as is.

	    Args:
	        state_dict (`dict[str, torch.Tensor]`):
	            The state dict to convert.
	        original_type (`StateDictType`, optional):
	            The original type of the state dict, if not provided, the method will try to infer it automatically.
	        kwargs (`dict`, args):
	            Additional arguments to pass to the method.

	            - adapter_name: For example, in case of PEFT, some keys will be prepended
	                with the adapter name, therefore needs a special handling. By default PEFT also takes care of that in
	                `get_peft_model_state_dict` method:
	                https://github.com/huggingface/peft/blob/ba0477f2985b1ba311b83459d29895c809404e99/src/peft/utils/save_and_load.py#L92
	                but we add it here in case we don't want to rely on that method.

	    Raises:
	        ValueError: If the type cannot be inferred, or if `original_type` has no diffusers mapping.
	    """
	    adapter_name = kwargs.pop("adapter_name", None)
	    # PEFT keys embed the adapter name as an extra dotted component (e.g. `.lora_A.<adapter>.weight`).
	    adapter_suffix = "" if adapter_name is None else "." + adapter_name

	    if original_type is None:
	        peft_marker = f".lora_A{adapter_suffix}.weight"
	        if any("to_out_lora" in name for name in state_dict.keys()):
	            original_type = StateDictType.DIFFUSERS_OLD
	        elif any(peft_marker in name for name in state_dict.keys()):
	            original_type = StateDictType.PEFT
	        elif any("lora_linear_layer" in name for name in state_dict.keys()):
	            # Already in the new diffusers format: nothing to do
	            return state_dict
	        else:
	            raise ValueError("Could not automatically infer state dict type")

	    if original_type not in DIFFUSERS_STATE_DICT_MAPPINGS:
	        raise ValueError(f"Original type {original_type} is not supported")

	    return convert_state_dict(state_dict, DIFFUSERS_STATE_DICT_MAPPINGS[original_type])


	def convert_unet_state_dict_to_peft(state_dict):
	    r"""
	    Renames the keys of a UNet LoRA state dict by applying the `UNET_TO_DIFFUSERS` substitutions (for example
	    `.to_q_lora.down` becomes `.to_q.lora_A`) and returns the converted state dict.
	    """
	    return convert_state_dict(state_dict, UNET_TO_DIFFUSERS)


	def convert_sai_sd_control_lora_state_dict_to_peft(state_dict):
	    r"""
	    Converts a Control-LoRA state dict to PEFT-style key names in two steps.

	    First the keys (`input_blocks.*`, `middle_block.*`, `zero_convs.*`, `input_hint_block.*`, `time_embed.*`,
	    `label_emb.*`) are renamed to ControlNet names such as `down_blocks.*`, `mid_block.*`, `controlnet_down_blocks.*`
	    and `controlnet_cond_embedding.*`. Then the `CONTROL_LORA_TO_DIFFUSERS` substitutions turn the `.down` / `.up`
	    suffixes into `.lora_A.weight` / `.lora_B.weight`.

	    Args:
	        state_dict (`dict[str, torch.Tensor]`):
	            The state dict to convert.

	    Returns:
	        `dict`: The converted state dict. Entries whose source key is absent (looked up with `.get`) are `None`.
	    """

	    def _rename_resnet_parts(name):
	        # Sub-module renames shared by the down-block resnets and the mid-block resnets.
	        return (
	            name.replace("in_layers.0", "norm1")
	            .replace("in_layers.2", "conv1")
	            .replace("out_layers.0", "norm2")
	            .replace("out_layers.3", "conv2")
	            .replace("emb_layers.1", "time_emb_proj")
	            .replace("skip_connection", "conv_shortcut")
	        )

	    def _count_two_part_prefixes(source, marker):
	        # Number of distinct "<first>.<second>" key prefixes among the keys that contain `marker`.
	        return len({".".join(name.split(".")[:2]) for name in source if marker in name})

	    def _convert_controlnet_to_diffusers(source):
	        # The presence of this key is what distinguishes the two supported variants.
	        variant = "SD15" if "input_blocks.11.0.in_layers.0.weight" in source else "SDXL"
	        logger.info(f"Using ControlNet lora ({variant})")

	        layers_per_block = 2
	        # Input blocks after the first are consumed in groups of `layers_per_block + 1`.
	        group_size = layers_per_block + 1

	        # Retrieves the keys for the input blocks only
	        num_input_blocks = _count_two_part_prefixes(source, "input_blocks")
	        input_blocks = {}
	        for layer_id in range(num_input_blocks):
	            marker = f"input_blocks.{layer_id}"
	            input_blocks[layer_id] = [name for name in source if marker in name]

	        # op blocks
	        op_names = [name for name in source if "0.op" in name]

	        renamed = {}

	        # Conv in layers
	        for name in input_blocks[0]:
	            new_name = name.replace("input_blocks.0.0", "conv_in")
	            renamed[new_name] = source.get(name)

	        # controlnet time embedding blocks
	        for name in [n for n in source if "time_embed" in n]:
	            new_name = name.replace("time_embed.0", "time_embedding.linear_1")
	            new_name = new_name.replace("time_embed.2", "time_embedding.linear_2")
	            renamed[new_name] = source.get(name)

	        # controlnet label embedding blocks
	        for name in [n for n in source if "label_emb" in n]:
	            new_name = name.replace("label_emb.0.0", "add_embedding.linear_1")
	            new_name = new_name.replace("label_emb.0.2", "add_embedding.linear_2")
	            renamed[new_name] = source.get(name)

	        # Down blocks
	        for i in range(1, num_input_blocks):
	            block_id, layer_in_block_id = divmod(i - 1, group_size)

	            resnet_marker = f"input_blocks.{i}.0"
	            op_marker = f"input_blocks.{i}.0.op"
	            attention_marker = f"input_blocks.{i}.1"

	            # Resnet weights: everything under `.0` except the downsampling op
	            for name in input_blocks[i]:
	                if resnet_marker not in name or op_marker in name:
	                    continue
	                new_name = _rename_resnet_parts(name).replace(
	                    resnet_marker, f"down_blocks.{block_id}.resnets.{layer_in_block_id}"
	                )
	                renamed[new_name] = source.get(name)

	            # Downsampler: only handled when the op has a bias entry
	            if f"input_blocks.{i}.0.op.bias" in source:
	                for name in op_names:
	                    if op_marker not in name:
	                        continue
	                    new_name = name.replace(op_marker, f"down_blocks.{block_id}.downsamplers.0.conv")
	                    renamed[new_name] = source.get(name)

	            # Attention weights live under `.1`
	            for name in input_blocks[i]:
	                if attention_marker not in name:
	                    continue
	                new_name = name.replace(
	                    attention_marker, f"down_blocks.{block_id}.attentions.{layer_in_block_id}"
	                )
	                renamed[new_name] = source.get(name)

	        # controlnet down blocks
	        for i in range(num_input_blocks):
	            for part in ("weight", "bias"):
	                renamed[f"controlnet_down_blocks.{i}.{part}"] = source.get(f"zero_convs.{i}.0.{part}")

	        # Retrieves the keys for the middle blocks only
	        num_middle_blocks = _count_two_part_prefixes(source, "middle_block")
	        middle_blocks = {}
	        for layer_id in range(num_middle_blocks):
	            marker = f"middle_block.{layer_id}"
	            middle_blocks[layer_id] = [name for name in source if marker in name]

	        # Mid blocks: even indices are treated as resnets, odd indices as attentions
	        for layer_id, names in middle_blocks.items():
	            target_id = max(layer_id - 1, 0)
	            old_prefix = f"middle_block.{layer_id}"
	            is_resnet = layer_id % 2 == 0
	            for name in names:
	                if is_resnet:
	                    new_name = _rename_resnet_parts(name).replace(old_prefix, f"mid_block.resnets.{target_id}")
	                else:
	                    new_name = name.replace(old_prefix, f"mid_block.attentions.{target_id}")
	                renamed[new_name] = source.get(name)

	        # mid block
	        renamed["controlnet_mid_block.weight"] = source.get("middle_block_out.0.weight")
	        renamed["controlnet_mid_block.bias"] = source.get("middle_block_out.0.bias")

	        # controlnet cond embedding blocks (conv_in and conv_out are handled separately below)
	        cond_prefixes = set()
	        for name in source:
	            if "input_hint_block" not in name:
	                continue
	            if "input_hint_block.0" in name or "input_hint_block.14" in name:
	                continue
	            cond_prefixes.add(".".join(name.split(".")[:2]))

	        for diffusers_idx in range(len(cond_prefixes)):
	            # Source indices are 2, 4, 6, ... for target blocks 0, 1, 2, ...
	            cond_block_id = 2 * (diffusers_idx + 1)
	            for part in ("weight", "bias"):
	                renamed[f"controlnet_cond_embedding.blocks.{diffusers_idx}.{part}"] = source.get(
	                    f"input_hint_block.{cond_block_id}.{part}"
	                )

	        for name in [n for n in source if "input_hint_block.0" in n]:
	            new_name = name.replace("input_hint_block.0", "controlnet_cond_embedding.conv_in")
	            renamed[new_name] = source.get(name)

	        for name in [n for n in source if "input_hint_block.14" in n]:
	            new_name = name.replace("input_hint_block.14", "controlnet_cond_embedding.conv_out")
	            renamed[new_name] = source.get(name)

	        return renamed

	    diffusers_named = _convert_controlnet_to_diffusers(state_dict)
	    return convert_state_dict(diffusers_named, CONTROL_LORA_TO_DIFFUSERS)


	def convert_all_state_dict_to_peft(state_dict):
	    r"""
	    Attempts to first `convert_state_dict_to_peft`, and if it doesn't detect `lora_linear_layer` for a valid
	    `DIFFUSERS` LoRA for example, attempts to exclusively convert the Unet `convert_unet_state_dict_to_peft`

	    Args:
	        state_dict (`dict[str, torch.Tensor]`):
	            The state dict to convert.

	    Returns:
	        `dict`: The converted state dict; at least one key contains `lora_A` or `lora_B`.

	    Raises:
	        ValueError: If no converted key contains `lora_A` or `lora_B`, or if `convert_state_dict_to_peft` fails for
	            a reason other than being unable to infer the state dict type.
	    """
	    try:
	        peft_dict = convert_state_dict_to_peft(state_dict)
	    except ValueError as e:
	        # Only the "cannot infer the format" failure triggers the UNet fallback. It is raised as a `ValueError`, so
	        # there is no need to intercept unrelated exception types.
	        if str(e) != "Could not automatically infer state dict type":
	            raise
	        peft_dict = convert_unet_state_dict_to_peft(state_dict)

	    if not any("lora_A" in key or "lora_B" in key for key in peft_dict):
	        raise ValueError("Your LoRA was not converted to PEFT")

	    return peft_dict


	def convert_state_dict_to_kohya(state_dict, original_type=None, **kwargs):
	    r"""
	    Converts a `PEFT` state dict to `Kohya` format that can be used in AUTOMATIC1111, ComfyUI, SD.Next, InvokeAI, etc.
	    The method only supports the conversion from PEFT to Kohya for now.

	    Args:
	        state_dict (`dict[str, torch.Tensor]`):
	            The state dict to convert.
	        original_type (`StateDictType`, optional):
	            The original type of the state dict, if not provided, the method will try to infer it automatically.
	        kwargs (`dict`, args):
	            Additional arguments to pass to the method.

	            - adapter_name: For example, in case of PEFT, some keys will be prepended
	                with the adapter name, therefore needs a special handling. By default PEFT also takes care of that in
	                `get_peft_model_state_dict` method:
	                https://github.com/huggingface/peft/blob/ba0477f2985b1ba311b83459d29895c809404e99/src/peft/utils/save_and_load.py#L92
	                but we add it here in case we don't want to rely on that method.

	    Returns:
	        `dict`: The renamed weights, plus one `<prefix>.alpha` tensor for every `lora_down` key.

	    Raises:
	        ImportError: If torch is not installed.
	        ValueError: If `original_type` (given or inferred) has no Kohya mapping.
	    """
	    try:
	        import torch
	    except ImportError:
	        logger.error("Converting PEFT state dicts to Kohya requires torch to be installed.")
	        raise

	    peft_adapter_name = kwargs.pop("adapter_name", None)
	    # PEFT keys embed the adapter name as an extra dotted component (e.g. `.lora_A.<adapter>.weight`).
	    peft_adapter_name = "" if peft_adapter_name is None else "." + peft_adapter_name

	    if original_type is None:
	        if any(f".lora_A{peft_adapter_name}.weight" in k for k in state_dict.keys()):
	            original_type = StateDictType.PEFT

	    if original_type not in KOHYA_STATE_DICT_MAPPINGS:
	        raise ValueError(f"Original type {original_type} is not supported")

	    # Use the convert_state_dict function with the mapping registered for the validated source type
	    kohya_ss_partial_state_dict = convert_state_dict(state_dict, KOHYA_STATE_DICT_MAPPINGS[original_type])
	    kohya_ss_state_dict = {}

	    # Additional logic for replacing header, alpha parameters `.` with `_` in all keys
	    for kohya_key, weight in kohya_ss_partial_state_dict.items():
	        # NOTE(review): this is an `elif` chain, so the `lora_magnitude_vector` -> `dora_scale` rename is only reached
	        # for keys that contain none of the prefixes tested before it - confirm that this is intended.
	        if "text_encoder_2." in kohya_key:
	            kohya_key = kohya_key.replace("text_encoder_2.", "lora_te2.")
	        elif "text_encoder." in kohya_key:
	            kohya_key = kohya_key.replace("text_encoder.", "lora_te1.")
	        elif "unet" in kohya_key:
	            kohya_key = kohya_key.replace("unet", "lora_unet")
	        elif "lora_magnitude_vector" in kohya_key:
	            kohya_key = kohya_key.replace("lora_magnitude_vector", "dora_scale")

	        # Turn every `.` into `_` except the last two. The count is clamped at 0 because `str.replace` treats a
	        # negative count as "replace all", which would also rewrite the dots of keys with fewer than two of them.
	        dots_to_replace = max(kohya_key.count(".") - 2, 0)
	        kohya_key = kohya_key.replace(".", "_", dots_to_replace)
	        kohya_key = kohya_key.replace(peft_adapter_name, "")  # Kohya doesn't take names
	        kohya_ss_state_dict[kohya_key] = weight
	        if "lora_down" in kohya_key:
	            # One alpha entry per down weight, set to the length of the weight's first dimension.
	            alpha_key = f"{kohya_key.split('.')[0]}.alpha"
	            kohya_ss_state_dict[alpha_key] = torch.tensor(len(weight))

	    return kohya_ss_state_dict


	def state_dict_all_zero(state_dict, filter_str=None):
	    r"""
	    Checks whether every selected tensor in `state_dict` contains only zeros.

	    Args:
	        state_dict (`dict[str, torch.Tensor]`):
	            The state dict to inspect.
	        filter_str (`str` or iterable of `str`, optional):
	            When given, only entries whose key contains at least one of these substrings are inspected.

	    Returns:
	        `bool`: `True` if all inspected tensors are entirely zero (also when nothing is selected), else `False`.
	    """
	    if filter_str is None:
	        selected = state_dict.values()
	    else:
	        needles = [filter_str] if isinstance(filter_str, str) else filter_str
	        selected = [param for key, param in state_dict.items() if any(needle in key for needle in needles)]

	    for param in selected:
	        if not torch.all(param == 0).item():
	            return False
	    return True


	def _load_sft_state_dict_metadata(model_file: str):
	    r"""
	    Reads the LoRA adapter metadata stored in the header of a safetensors file.

	    Args:
	        model_file (`str`):
	            Path of the safetensors file to open.

	    Returns:
	        The JSON-decoded value stored under `LORA_ADAPTER_METADATA_KEY`, or `None` when the header has no metadata
	        besides `format` or when that entry is missing or empty.
	    """
	    import safetensors.torch

	    from ..loaders.lora_base import LORA_ADAPTER_METADATA_KEY

	    with safetensors.torch.safe_open(model_file, framework="pt", device="cpu") as handle:
	        header = handle.metadata() or {}

	    # The `format` entry is dropped before deciding whether any metadata is left.
	    header.pop("format", None)
	    if not header:
	        return None

	    serialized = header.get(LORA_ADAPTER_METADATA_KEY)
	    if serialized:
	        return json.loads(serialized)
	    return None