model / convert_sd3.py

Upload 2 files

455acc7 verified over 1 year ago

8.95 kB

	import os
	import torch
	import numpy as np
	import gguf # This needs to be the llama.cpp one specifically!
	import argparse
	from tqdm import tqdm

	from safetensors.torch import load_file

	# float32/bfloat16 source tensors with at most this many elements are stored as F32.
	QUANTIZATION_THRESHOLD = 1024
	# Minimum element count before a tensor is considered for the (n, 256) reshape.
	REARRANGE_THRESHOLD = 512
	# Longest tensor name accepted; any longer name aborts the conversion with ValueError.
	MAX_TENSOR_NAME_LENGTH = 127


	class QuantError(Exception):
	    """Raised when a tensor cannot be converted to the requested type."""

	class quants:
	    """Minimal dtype-cast helpers keyed by GGML quantization type."""

	    @staticmethod
	    def quantize(data, data_qtype):
	        """Cast ``data`` to the NumPy dtype used for ``data_qtype``.

	        Args:
	            data: NumPy array to convert.
	            data_qtype: a ``gguf.GGMLQuantizationType`` member; only F16 and
	                BF16 are handled.

	        Returns:
	            A float16 array for F16, a float32 array for BF16.

	        Raises:
	            QuantError: for any other quantization type.
	        """
	        # The enum lives in the gguf package; the bare name is never imported here.
	        qtypes = gguf.GGMLQuantizationType
	        if data_qtype == qtypes.F16:
	            return data.astype(np.float16)
	        if data_qtype == qtypes.BF16:
	            # NumPy has no bfloat16 dtype, so the values are widened to float32.
	            # NOTE(review): the result is 4 bytes per element, not 2-byte bf16 —
	            # confirm any consumer that declares the tensor as BF16 can accept that.
	            return data.astype(np.float32)
	        raise QuantError("Unsupported quantization type")

	class ModelTemplate:
	    """Base description of a model architecture the converter can detect."""

	    # Architecture name passed to the GGUF writer.
	    arch = "invalid"
	    # Whether eligible tensors are reshaped during conversion.
	    shape_fix = False
	    # Groups of state-dict keys; the model matches when every key of one group is present.
	    keys_detect = []
	    # Keys whose presence marks a matched model as invalid for conversion.
	    keys_banned = []

	class ModelFlux(ModelTemplate):
	    """Detection keys for Flux checkpoints."""

	    # NOTE(review): "fluxz" may be a typo for "flux" — confirm against whatever
	    # loader reads the architecture name from the generated file.
	    arch = "fluxz"
	    keys_detect = [
	        ("transformer_blocks.0.attn.norm_added_k.weight",),
	        ("double_blocks.0.img_attn.proj.weight",),
	    ]
	    # The first detection key is also banned, so a match on it aborts conversion.
	    keys_banned = [
	        "transformer_blocks.0.attn.norm_added_k.weight",
	    ]

	class ModelSD3(ModelTemplate):
	    """Detection keys for SD3 checkpoints."""

	    arch = "sd3"
	    keys_detect = [
	        ("transformer_blocks.0.attn.add_q_proj.weight",),
	        ("joint_blocks.0.x_block.attn.qkv.weight",),
	    ]
	    # The first detection key is also banned, so a match on it aborts conversion.
	    keys_banned = [
	        "transformer_blocks.0.attn.add_q_proj.weight",
	    ]

	class ModelSDXL(ModelTemplate):
	    """Detection keys for SDXL UNets; tensors of this architecture may be reshaped."""

	    arch = "sdxl"
	    shape_fix = True
	    keys_detect = [
	        (
	            "down_blocks.0.downsamplers.0.conv.weight",
	            "add_embedding.linear_1.weight",
	        ),
	        (  # Non-diffusers
	            "input_blocks.3.0.op.weight",
	            "input_blocks.6.0.op.weight",
	            "output_blocks.2.2.conv.weight",
	            "output_blocks.5.2.conv.weight",
	        ),
	        ("label_emb.0.0.weight",),
	    ]

	class ModelSD1(ModelTemplate):
	    """Detection keys for SD1 UNets; tensors are never reshaped."""

	    arch = "sd1"
	    shape_fix = False
	    keys_detect = [
	        ("down_blocks.0.downsamplers.0.conv.weight",),
	        (  # Non-diffusers
	            "input_blocks.3.0.op.weight",
	            "input_blocks.6.0.op.weight",
	            "input_blocks.9.0.op.weight",
	            "output_blocks.2.1.conv.weight",
	            "output_blocks.5.2.conv.weight",
	            "output_blocks.8.2.conv.weight",
	        ),
	    ]

	# Detection order: detect_arch() returns the first template that matches,
	# so ModelSD3 is deliberately tried before ModelFlux.
	arch_list = [ModelSD3, ModelFlux, ModelSDXL, ModelSD1]

	def is_model_arch(model, state_dict):
	    """Return whether ``state_dict`` matches the ``model`` architecture template.

	    A template matches when every key of at least one group in
	    ``model.keys_detect`` is present in ``state_dict``. Only the first
	    matching group is considered.

	    Args:
	        model: object exposing ``keys_detect`` (iterable of key groups) and
	            ``keys_banned`` (iterable of keys).
	        state_dict: mapping (or any container supporting ``in``) of tensor names.

	    Returns:
	        True if a detection group matched, False otherwise.

	    Raises:
	        AssertionError: if a detection group matched but a banned key is
	            also present in ``state_dict``.
	    """
	    for match_list in model.keys_detect:
	        if all(key in state_dict for key in match_list):
	            # Banned keys only matter once the template has matched.
	            if any(key in state_dict for key in model.keys_banned):
	                # Raised explicitly rather than via `assert` so the check is
	                # not stripped when Python runs with -O.
	                raise AssertionError(
	                    "Model architecture not allowed for conversion! (i.e. reference VS diffusers format)"
	                )
	            return True
	    return False

	def detect_arch(state_dict):
	    """Return the first template in ``arch_list`` that matches ``state_dict``.

	    Args:
	        state_dict: mapping of tensor names to tensors.

	    Returns:
	        The matching architecture template class.

	    Raises:
	        AssertionError: if no template matches, or if ``is_model_arch``
	            rejects the model because of a banned key.
	    """
	    for arch in arch_list:
	        if is_model_arch(arch, state_dict):
	            return arch
	    # Raised explicitly rather than via `assert` so an unknown model still
	    # fails here (instead of returning None) when Python runs with -O.
	    raise AssertionError("Unknown model architecture!")

	def parse_args():
	    """Parse the command line and validate the source path.

	    Returns:
	        The parsed namespace with ``src`` (required) and ``dst`` (optional,
	        ``None`` when omitted).

	    Exits via ``parser.error`` (``SystemExit``) when ``--src`` is missing or
	    does not point to an existing file.
	    """
	    parser = argparse.ArgumentParser(description="Generate F16 GGUF files from single UNET")
	    parser.add_argument("--src", required=True, help="Source model ckpt file.")
	    parser.add_argument("--dst", help="Output unet gguf file.")
	    args = parser.parse_args()

	    if not os.path.isfile(args.src):
	        # --src is required, so reaching this point means a path was given
	        # but it is not a file; say so and name the path.
	        parser.error(f"Source file not found: {args.src}")

	    return args

	def load_state_dict(path):
	    """Load a checkpoint and return only the UNet tensors, without prefix.

	    Files ending in .ckpt/.pt/.bin/.pth are read with ``torch.load`` (and
	    unwrapped from a top-level "model" entry when present); anything else
	    is read with ``load_file``. If any key contains
	    "model.diffusion_model.", only keys containing it are kept and that
	    text is removed from them; otherwise every key is kept unchanged.
	    """
	    unet_prefix = "model.diffusion_model."

	    if path.endswith((".ckpt", ".pt", ".bin", ".pth")):
	        loaded = torch.load(path, map_location="cpu", weights_only=True)
	        loaded = loaded.get("model", loaded)
	    else:
	        loaded = load_file(path)

	    # only keep unet with no prefix!
	    if not any(unet_prefix in name for name in loaded.keys()):
	        return dict(loaded.items())

	    return {
	        name.replace(unet_prefix, ""): tensor
	        for name, tensor in loaded.items()
	        if unet_prefix in name
	    }

	def load_model(path):
	    """Load ``path`` and return ``(writer, state_dict, model_arch)``.

	    NOTE: a later ``def load_model`` in this file rebinds the name, so this
	    three-value variant is not the one the ``__main__`` section calls.
	    """
	    tensors = load_state_dict(path)
	    detected = detect_arch(tensors)
	    print(f"* Architecture detected from input: {detected.arch}")
	    # The writer is created without an output path here.
	    gguf_writer = gguf.GGUFWriter(path=None, arch=detected.arch)
	    return (gguf_writer, tensors, detected)

	# Key fragments whose ".weight" tensors are kept at F32 when the source
	# dtype is float32 or bfloat16. Defined once instead of on every iteration.
	_KEEP_F32_KEY_FRAGMENTS = (
	    "time_embedding.",
	    "add_embedding.",
	    "time_in.",
	    "txt_in.",
	    "vector_in.",
	    "img_in.",
	    "guidance_in.",
	    "final_layer.",
	)


	def _tensor_to_numpy(tensor):
	    """Convert a torch tensor to NumPy, widening dtypes NumPy cannot hold."""
	    if tensor.dtype == torch.bfloat16:
	        return tensor.to(torch.float32).numpy()
	    # getattr with a dummy default so torch 2.0.X (no float8 dtypes) still works.
	    float8_dtypes = [
	        getattr(torch, "float8_e4m3fn", "_invalid"),
	        getattr(torch, "float8_e5m2", "_invalid"),
	    ]
	    if tensor.dtype in float8_dtypes:
	        return tensor.to(torch.float16).numpy()
	    return tensor.numpy()


	def _select_qtype(key, old_dtype, n_dims, n_params):
	    """Pick the GGML storage type for one tensor."""
	    qtypes = gguf.GGMLQuantizationType
	    if old_dtype in (torch.float32, torch.bfloat16) and (
	        n_dims == 1  # one-dimensional tensors stay F32 (no dequantizing at inference)
	        or n_params <= QUANTIZATION_THRESHOLD  # very small tensors
	        or (".weight" in key and any(part in key for part in _KEEP_F32_KEY_FRAGMENTS))
	    ):
	        return qtypes.F32
	    return qtypes.BF16 if old_dtype == torch.bfloat16 else qtypes.F16


	def handle_tensors(args, writer, state_dict, model_arch):
	    """Add every tensor in ``state_dict`` to ``writer``.

	    For each tensor this picks a storage type (F32, F16 or BF16), optionally
	    reshapes it to ``(n, 256)`` when ``model_arch.shape_fix`` is set (the
	    original shape is recorded under ``comfy.gguf.orig_shape.<key>``),
	    converts the array to the chosen type and hands it to
	    ``writer.add_tensor``. One progress line per tensor is printed.

	    Args:
	        args: parsed command-line namespace; currently unused.
	        writer: GGUF writer providing ``add_array`` and ``add_tensor``.
	        state_dict: mapping of tensor name to torch tensor.
	        model_arch: architecture template; only ``shape_fix`` is read.

	    Raises:
	        ValueError: if any tensor name is longer than MAX_TENSOR_NAME_LENGTH.
	    """
	    if not state_dict:
	        return

	    max_name_len = max(len(key) for key in state_dict)
	    if max_name_len > MAX_TENSOR_NAME_LENGTH:
	        too_long = sorted(
	            ((key, len(key)) for key in state_dict if len(key) > MAX_TENSOR_NAME_LENGTH),
	            key=lambda item: item[1],
	            reverse=True,
	        )
	        bad_list = ", ".join(f"{key!r} ({namelen})" for key, namelen in too_long)
	        raise ValueError(f"Can only handle tensor names up to {MAX_TENSOR_NAME_LENGTH} characters. Tensors exceeding the limit: {bad_list}")

	    for key, tensor in tqdm(state_dict.items()):
	        old_dtype = tensor.dtype
	        data = _tensor_to_numpy(tensor)

	        n_dims = data.ndim
	        n_params = int(data.size)  # number of elements in this tensor
	        data_qtype = _select_qtype(key, old_dtype, n_dims, n_params)

	        if (
	            model_arch.shape_fix  # NEVER reshape for models such as flux
	            and n_dims > 1  # skip one-dimensional tensors
	            and n_params >= REARRANGE_THRESHOLD  # only tensors meeting the size requirement
	            and n_params % 256 == 0  # total element count must be divisible by 256
	            and data.shape[-1] % 256 != 0  # last dimension not already divisible by 256
	        ):
	            orig_shape = data.shape
	            data = data.reshape(n_params // 256, 256)
	            writer.add_array(f"comfy.gguf.orig_shape.{key}", tuple(int(dim) for dim in orig_shape))

	        # Logged shape is taken before conversion so it reflects element counts.
	        shape_str = f"{{{', '.join(str(n) for n in reversed(data.shape))}}}"

	        # Convert the array to the type declared via raw_dtype. Without this,
	        # float32 bytes would be written under an F16/BF16 declaration.
	        try:
	            data = gguf.quants.quantize(data, data_qtype)
	        except (NotImplementedError, gguf.QuantError) as err:
	            tqdm.write(f"falling back to F16: {err}")
	            data_qtype = gguf.GGMLQuantizationType.F16
	            data = gguf.quants.quantize(data, data_qtype)

	        tqdm.write(f"{key:<{max_name_len + 4}} {old_dtype} --> {data_qtype.name}, shape = {shape_str}")

	        writer.add_tensor(key, data, raw_dtype=data_qtype)



	def load_model(path):
	    """Load ``path`` and return ``(state_dict, model_arch)``.

	    The detected architecture name is printed as a side effect.
	    """
	    tensors = load_state_dict(path)
	    detected = detect_arch(tensors)
	    print(f"* Architecture detected from input: {detected.arch}")
	    return tensors, detected

	...

	if __name__ == "__main__":
	    args = parse_args()
	    path = args.src
	    state_dict, model_arch = load_model(path)

	    # The default output name is derived from the dtype of the first tensor.
	    first_dtype = next(iter(state_dict.values())).dtype
	    suffix = "BF16" if first_dtype == torch.bfloat16 else "F16"
	    out_path = args.dst or f"{os.path.splitext(path)[0]}-{suffix}.gguf"

	    # Give the user a chance to abort before an existing file is replaced.
	    if os.path.isfile(out_path):
	        input("Output exists enter to continue or ctrl+c to abort!")

	    writer = gguf.GGUFWriter(path=out_path, arch=model_arch.arch)
	    writer.add_quantization_version(1)
	    handle_tensors(args, writer, state_dict, model_arch)

	    # Header, key/value metadata and tensor data are written in that order.
	    writer.write_header_to_file()
	    writer.write_kv_data_to_file()
	    writer.write_tensors_to_file()
	    writer.close()