Koke_Cacao committed
Commit · 1a29d24 · 1 Parent(s): 9db01cb
:bug: fix for sd 2.1
scripts/convert_mvdream_to_diffusers.py
CHANGED
@@ -27,105 +27,6 @@ from transformers import CLIPTokenizer, CLIPTextModel
 
 logger = logging.get_logger(__name__)
 
-# def create_unet_diffusers_config(original_config, image_size: int, controlnet=False):
-#     """
-#     Creates a config for the diffusers based on the config of the LDM model.
-#     """
-#     if controlnet:
-#         unet_params = original_config.model.params.control_stage_config.params
-#     else:
-#         if "unet_config" in original_config.model.params and original_config.model.params.unet_config is not None:
-#             unet_params = original_config.model.params.unet_config.params
-#         else:
-#             unet_params = original_config.model.params.network_config.params
-
-#     vae_params = original_config.model.params.first_stage_config.params.ddconfig
-
-#     block_out_channels = [unet_params.model_channels * mult for mult in unet_params.channel_mult]
-
-#     down_block_types = []
-#     resolution = 1
-#     for i in range(len(block_out_channels)):
-#         block_type = "CrossAttnDownBlock2D" if resolution in unet_params.attention_resolutions else "DownBlock2D"
-#         down_block_types.append(block_type)
-#         if i != len(block_out_channels) - 1:
-#             resolution *= 2
-
-#     up_block_types = []
-#     for i in range(len(block_out_channels)):
-#         block_type = "CrossAttnUpBlock2D" if resolution in unet_params.attention_resolutions else "UpBlock2D"
-#         up_block_types.append(block_type)
-#         resolution //= 2
-
-#     if unet_params.transformer_depth is not None:
-#         transformer_layers_per_block = (
-#             unet_params.transformer_depth
-#             if isinstance(unet_params.transformer_depth, int)
-#             else list(unet_params.transformer_depth)
-#         )
-#     else:
-#         transformer_layers_per_block = 1
-
-#     vae_scale_factor = 2 ** (len(vae_params.ch_mult) - 1)
-
-#     head_dim = unet_params.num_heads if "num_heads" in unet_params else None
-#     use_linear_projection = (
-#         unet_params.use_linear_in_transformer if "use_linear_in_transformer" in unet_params else False
-#     )
-#     if use_linear_projection:
-#         # stable diffusion 2-base-512 and 2-768
-#         if head_dim is None:
-#             head_dim_mult = unet_params.model_channels // unet_params.num_head_channels
-#             head_dim = [head_dim_mult * c for c in list(unet_params.channel_mult)]
-
-#     class_embed_type = None
-#     addition_embed_type = None
-#     addition_time_embed_dim = None
-#     projection_class_embeddings_input_dim = None
-#     context_dim = None
-
-#     if unet_params.context_dim is not None:
-#         context_dim = (
-#             unet_params.context_dim if isinstance(unet_params.context_dim, int) else unet_params.context_dim[0]
-#         )
-
-#     if "num_classes" in unet_params:
-#         if unet_params.num_classes == "sequential":
-#             if context_dim in [2048, 1280]:
-#                 # SDXL
-#                 addition_embed_type = "text_time"
-#                 addition_time_embed_dim = 256
-#             else:
-#                 class_embed_type = "projection"
-#                 assert "adm_in_channels" in unet_params
-#                 projection_class_embeddings_input_dim = unet_params.adm_in_channels
-#         else:
-#             raise NotImplementedError(f"Unknown conditional unet num_classes config: {unet_params.num_classes}")
-
-#     config = {
-#         "sample_size": image_size // vae_scale_factor,
-#         "in_channels": unet_params.in_channels,
-#         "down_block_types": tuple(down_block_types),
-#         "block_out_channels": tuple(block_out_channels),
-#         "layers_per_block": unet_params.num_res_blocks,
-#         "cross_attention_dim": context_dim,
-#         "attention_head_dim": head_dim,
-#         "use_linear_projection": use_linear_projection,
-#         "class_embed_type": class_embed_type,
-#         "addition_embed_type": addition_embed_type,
-#         "addition_time_embed_dim": addition_time_embed_dim,
-#         "projection_class_embeddings_input_dim": projection_class_embeddings_input_dim,
-#         "transformer_layers_per_block": transformer_layers_per_block,
-#     }
-
-#     if controlnet:
-#         config["conditioning_channels"] = unet_params.hint_channels
-#     else:
-#         config["out_channels"] = unet_params.out_channels
-#         config["up_block_types"] = tuple(up_block_types)
-
-#     return config
-
 
 def assign_to_checkpoint(paths, checkpoint, old_checkpoint, attention_paths_to_split=None, additional_replacements=None, config=None):
     """
@@ -190,291 +91,6 @@ def shave_segments(path, n_shave_prefix_segments=1):
     return ".".join(path.split(".")[:n_shave_prefix_segments])
 
 
-def renew_resnet_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside resnets to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item.replace("in_layers.0", "norm1")
-        new_item = new_item.replace("in_layers.2", "conv1")
-
-        new_item = new_item.replace("out_layers.0", "norm2")
-        new_item = new_item.replace("out_layers.3", "conv2")
-
-        new_item = new_item.replace("emb_layers.1", "time_emb_proj")
-        new_item = new_item.replace("skip_connection", "conv_shortcut")
-
-        new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-def renew_attention_paths(old_list, n_shave_prefix_segments=0):
-    """
-    Updates paths inside attentions to the new naming scheme (local renaming)
-    """
-    mapping = []
-    for old_item in old_list:
-        new_item = old_item
-
-        # new_item = new_item.replace('norm.weight', 'group_norm.weight')
-        # new_item = new_item.replace('norm.bias', 'group_norm.bias')
-
-        # new_item = new_item.replace('proj_out.weight', 'proj_attn.weight')
-        # new_item = new_item.replace('proj_out.bias', 'proj_attn.bias')
-
-        # new_item = shave_segments(new_item, n_shave_prefix_segments=n_shave_prefix_segments)
-
-        mapping.append({"old": old_item, "new": new_item})
-
-    return mapping
-
-
-# def convert_ldm_unet_checkpoint(
-#     checkpoint, config, path=None, extract_ema=False, controlnet=False, skip_extract_state_dict=False
-# ):
-#     """
-#     Takes a state dict and a config, and returns a converted checkpoint.
-#     """
-
-#     if skip_extract_state_dict:
-#         unet_state_dict = checkpoint
-#     else:
-#         # extract state_dict for UNet
-#         unet_state_dict = {}
-#         keys = list(checkpoint.keys())
-
-#         if controlnet:
-#             unet_key = "control_model."
-#         else:
-#             unet_key = "model.diffusion_model."
-
-#         # at least a 100 parameters have to start with `model_ema` in order for the checkpoint to be EMA
-#         if sum(k.startswith("model_ema") for k in keys) > 100 and extract_ema:
-#             logger.warning(f"Checkpoint {path} has both EMA and non-EMA weights.")
-#             logger.warning(
-#                 "In this conversion only the EMA weights are extracted. If you want to instead extract the non-EMA"
-#                 " weights (useful to continue fine-tuning), please make sure to remove the `--extract_ema` flag."
-#             )
-#             for key in keys:
-#                 if key.startswith("model.diffusion_model"):
-#                     flat_ema_key = "model_ema." + "".join(key.split(".")[1:])
-#                     unet_state_dict[key.replace(unet_key, "")] = checkpoint[flat_ema_key]
-#         else:
-#             if sum(k.startswith("model_ema") for k in keys) > 100:
-#                 logger.warning(
-#                     "In this conversion only the non-EMA weights are extracted. If you want to instead extract the EMA"
-#                     " weights (usually better for inference), please make sure to add the `--extract_ema` flag."
-#                 )
-
-#             for key in keys:
-#                 if key.startswith(unet_key):
-#                     unet_state_dict[key.replace(unet_key, "")] = checkpoint[key]
-
-#     new_checkpoint = {}
-
-#     new_checkpoint["time_embedding.linear_1.weight"] = unet_state_dict["time_embed.0.weight"]
-#     new_checkpoint["time_embedding.linear_1.bias"] = unet_state_dict["time_embed.0.bias"]
-#     new_checkpoint["time_embedding.linear_2.weight"] = unet_state_dict["time_embed.2.weight"]
-#     new_checkpoint["time_embedding.linear_2.bias"] = unet_state_dict["time_embed.2.bias"]
-
-#     if config["class_embed_type"] is None:
-#         # No parameters to port
-#         ...
-#     elif config["class_embed_type"] == "timestep" or config["class_embed_type"] == "projection":
-#         new_checkpoint["class_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
-#         new_checkpoint["class_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
-#         new_checkpoint["class_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
-#         new_checkpoint["class_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
-#     else:
-#         raise NotImplementedError(f"Not implemented `class_embed_type`: {config['class_embed_type']}")
-
-#     if config["addition_embed_type"] == "text_time":
-#         new_checkpoint["add_embedding.linear_1.weight"] = unet_state_dict["label_emb.0.0.weight"]
-#         new_checkpoint["add_embedding.linear_1.bias"] = unet_state_dict["label_emb.0.0.bias"]
-#         new_checkpoint["add_embedding.linear_2.weight"] = unet_state_dict["label_emb.0.2.weight"]
-#         new_checkpoint["add_embedding.linear_2.bias"] = unet_state_dict["label_emb.0.2.bias"]
-
-#     new_checkpoint["conv_in.weight"] = unet_state_dict["input_blocks.0.0.weight"]
-#     new_checkpoint["conv_in.bias"] = unet_state_dict["input_blocks.0.0.bias"]
-
-#     if not controlnet:
-#         new_checkpoint["conv_norm_out.weight"] = unet_state_dict["out.0.weight"]
-#         new_checkpoint["conv_norm_out.bias"] = unet_state_dict["out.0.bias"]
-#         new_checkpoint["conv_out.weight"] = unet_state_dict["out.2.weight"]
-#         new_checkpoint["conv_out.bias"] = unet_state_dict["out.2.bias"]
-
-#     # Retrieves the keys for the input blocks only
-#     num_input_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "input_blocks" in layer})
-#     input_blocks = {
-#         layer_id: [key for key in unet_state_dict if f"input_blocks.{layer_id}" in key]
-#         for layer_id in range(num_input_blocks)
-#     }
-
-#     # Retrieves the keys for the middle blocks only
-#     num_middle_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "middle_block" in layer})
-#     middle_blocks = {
-#         layer_id: [key for key in unet_state_dict if f"middle_block.{layer_id}" in key]
-#         for layer_id in range(num_middle_blocks)
-#     }
-
-#     # Retrieves the keys for the output blocks only
-#     num_output_blocks = len({".".join(layer.split(".")[:2]) for layer in unet_state_dict if "output_blocks" in layer})
-#     output_blocks = {
-#         layer_id: [key for key in unet_state_dict if f"output_blocks.{layer_id}" in key]
-#         for layer_id in range(num_output_blocks)
-#     }
-
-#     for i in range(1, num_input_blocks):
-#         block_id = (i - 1) // (config["layers_per_block"] + 1)
-#         layer_in_block_id = (i - 1) % (config["layers_per_block"] + 1)
-
-#         resnets = [
-#             key for key in input_blocks[i] if f"input_blocks.{i}.0" in key and f"input_blocks.{i}.0.op" not in key
-#         ]
-#         attentions = [key for key in input_blocks[i] if f"input_blocks.{i}.1" in key]
-
-#         if f"input_blocks.{i}.0.op.weight" in unet_state_dict:
-#             new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.weight"] = unet_state_dict.pop(
-#                 f"input_blocks.{i}.0.op.weight"
-#             )
-#             new_checkpoint[f"down_blocks.{block_id}.downsamplers.0.conv.bias"] = unet_state_dict.pop(
-#                 f"input_blocks.{i}.0.op.bias"
-#             )
-
-#         paths = renew_resnet_paths(resnets)
-#         meta_path = {"old": f"input_blocks.{i}.0", "new": f"down_blocks.{block_id}.resnets.{layer_in_block_id}"}
-#         assign_to_checkpoint(
-#             paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#         )
-
-#         if len(attentions):
-#             paths = renew_attention_paths(attentions)
-#             meta_path = {"old": f"input_blocks.{i}.1", "new": f"down_blocks.{block_id}.attentions.{layer_in_block_id}"}
-#             assign_to_checkpoint(
-#                 paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#             )
-
-#     resnet_0 = middle_blocks[0]
-#     attentions = middle_blocks[1]
-#     resnet_1 = middle_blocks[2]
-
-#     resnet_0_paths = renew_resnet_paths(resnet_0)
-#     assign_to_checkpoint(resnet_0_paths, new_checkpoint, unet_state_dict, config=config)
-
-#     resnet_1_paths = renew_resnet_paths(resnet_1)
-#     assign_to_checkpoint(resnet_1_paths, new_checkpoint, unet_state_dict, config=config)
-
-#     attentions_paths = renew_attention_paths(attentions)
-#     meta_path = {"old": "middle_block.1", "new": "mid_block.attentions.0"}
-#     assign_to_checkpoint(
-#         attentions_paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#     )
-
-#     for i in range(num_output_blocks):
-#         block_id = i // (config["layers_per_block"] + 1)
-#         layer_in_block_id = i % (config["layers_per_block"] + 1)
-#         output_block_layers = [shave_segments(name, 2) for name in output_blocks[i]]
-#         output_block_list = {}
-
-#         for layer in output_block_layers:
-#             layer_id, layer_name = layer.split(".")[0], shave_segments(layer, 1)
-#             if layer_id in output_block_list:
-#                 output_block_list[layer_id].append(layer_name)
-#             else:
-#                 output_block_list[layer_id] = [layer_name]
-
-#         if len(output_block_list) > 1:
-#             resnets = [key for key in output_blocks[i] if f"output_blocks.{i}.0" in key]
-#             attentions = [key for key in output_blocks[i] if f"output_blocks.{i}.1" in key]
-
-#             resnet_0_paths = renew_resnet_paths(resnets)
-#             paths = renew_resnet_paths(resnets)
-
-#             meta_path = {"old": f"output_blocks.{i}.0", "new": f"up_blocks.{block_id}.resnets.{layer_in_block_id}"}
-#             assign_to_checkpoint(
-#                 paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#             )
-
-#             output_block_list = {k: sorted(v) for k, v in output_block_list.items()}
-#             if ["conv.bias", "conv.weight"] in output_block_list.values():
-#                 index = list(output_block_list.values()).index(["conv.bias", "conv.weight"])
-#                 new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.weight"] = unet_state_dict[
-#                     f"output_blocks.{i}.{index}.conv.weight"
-#                 ]
-#                 new_checkpoint[f"up_blocks.{block_id}.upsamplers.0.conv.bias"] = unet_state_dict[
-#                     f"output_blocks.{i}.{index}.conv.bias"
-#                 ]
-
-#                 # Clear attentions as they have been attributed above.
-#                 if len(attentions) == 2:
-#                     attentions = []
-
-#             if len(attentions):
-#                 paths = renew_attention_paths(attentions)
-#                 meta_path = {
-#                     "old": f"output_blocks.{i}.1",
-#                     "new": f"up_blocks.{block_id}.attentions.{layer_in_block_id}",
-#                 }
-#                 assign_to_checkpoint(
-#                     paths, new_checkpoint, unet_state_dict, additional_replacements=[meta_path], config=config
-#                 )
-#         else:
-#             resnet_0_paths = renew_resnet_paths(output_block_layers, n_shave_prefix_segments=1)
-#             for path in resnet_0_paths:
-#                 old_path = ".".join(["output_blocks", str(i), path["old"]])
-#                 new_path = ".".join(["up_blocks", str(block_id), "resnets", str(layer_in_block_id), path["new"]])
-
-#                 new_checkpoint[new_path] = unet_state_dict[old_path]
-
-#     if controlnet:
-#         # conditioning embedding
-
-#         orig_index = 0
-
-#         new_checkpoint["controlnet_cond_embedding.conv_in.weight"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.weight"
-#         )
-#         new_checkpoint["controlnet_cond_embedding.conv_in.bias"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.bias"
-#         )
-
-#         orig_index += 2
-
-#         diffusers_index = 0
-
-#         while diffusers_index < 6:
-#             new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.weight"] = unet_state_dict.pop(
-#                 f"input_hint_block.{orig_index}.weight"
-#             )
-#             new_checkpoint[f"controlnet_cond_embedding.blocks.{diffusers_index}.bias"] = unet_state_dict.pop(
-#                 f"input_hint_block.{orig_index}.bias"
-#             )
-#             diffusers_index += 1
-#             orig_index += 2
-
-#         new_checkpoint["controlnet_cond_embedding.conv_out.weight"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.weight"
-#         )
-#         new_checkpoint["controlnet_cond_embedding.conv_out.bias"] = unet_state_dict.pop(
-#             f"input_hint_block.{orig_index}.bias"
-#         )
-
-#         # down blocks
-#         for i in range(num_input_blocks):
-#             new_checkpoint[f"controlnet_down_blocks.{i}.weight"] = unet_state_dict.pop(f"zero_convs.{i}.0.weight")
-#             new_checkpoint[f"controlnet_down_blocks.{i}.bias"] = unet_state_dict.pop(f"zero_convs.{i}.0.bias")
-
-#         # mid block
-#         new_checkpoint["controlnet_mid_block.weight"] = unet_state_dict.pop("middle_block_out.0.weight")
-#         new_checkpoint["controlnet_mid_block.bias"] = unet_state_dict.pop("middle_block_out.0.bias")
-
-#     return new_checkpoint
-
-
 def create_vae_diffusers_config(original_config, image_size: int):
     """
     Creates a config for the diffusers based on the config of the LDM model.
@@ -706,8 +322,14 @@ def convert_from_original_mvdream_ckpt(checkpoint_path, original_config_file, de
     with init_empty_weights():
        vae = AutoencoderKL(**vae_config)
 
-
-
+    if original_config.model.params.unet_config.params.context_dim == 768:
+        tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
+        text_encoder: CLIPTextModel = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14").to(device=torch.device("cuda:0"))  # type: ignore
+    elif original_config.model.params.unet_config.params.context_dim == 1024:
+        tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="tokenizer")
+        text_encoder: CLIPTextModel = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="text_encoder").to(device=torch.device("cuda:0"))  # type: ignore
+    else:
+        raise ValueError(f"Unknown context_dim: {original_config.model.params.unet_config.params.context_dim}")
 
     for param_name, param in converted_vae_checkpoint.items():
         set_module_tensor_to_device(vae, param_name, "cuda:0", value=param)
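In short, the fix picks the text stack from the checkpoint's cross-attention width instead of assuming one: a context_dim of 768 corresponds to the CLIP ViT-L/14 encoder used by SD 1.x-based weights, while 1024 corresponds to the encoder shipped inside the Stable Diffusion 2.1 repo, which is what an SD 2.1-based MVDream checkpoint needs. A minimal self-contained sketch of the same dispatch (the helper name and the device parameter are ours; the repo ids and context_dim values come from the diff):

from transformers import CLIPTextModel, CLIPTokenizer


def load_text_stack(context_dim: int, device: str = "cpu"):
    """Pick the CLIP tokenizer/text encoder matching a UNet's cross-attention width."""
    if context_dim == 768:
        # SD 1.x: 768-dim CLIP ViT-L/14, published as a standalone model repo.
        tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
        text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
    elif context_dim == 1024:
        # SD 2.x: 1024-dim encoder stored in subfolders of the SD 2.1 pipeline repo.
        tokenizer = CLIPTokenizer.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="tokenizer")
        text_encoder = CLIPTextModel.from_pretrained("stabilityai/stable-diffusion-2-1", subfolder="text_encoder")
    else:
        raise ValueError(f"Unknown context_dim: {context_dim}")
    return tokenizer, text_encoder.to(device)

With an SD 2.1-based MVDream config this takes the 1024-dim branch, the case the commit message says was broken before; a 768-dim config keeps loading ViT-L/14 as the script did previously.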