Spaces:

EarthnDusk
/

SDXL_To_Diffusers

Sleeping

App Files Files Community

Duskfallcrew committed on Mar 16, 2025

Commit

5840753

verified ·

1 Parent(s): 96bf0f5

Update app.py

Browse files

Key Changes and Explanations:

load_sdxl_checkpoint (Corrected): This function now correctly extracts the state dictionaries for both text encoders (text_encoder1_state and text_encoder2_state), the VAE (vae_state), and the UNet (unet_state), using the appropriate key prefixes. It still assumes the Illustrious-xl model uses the standard SDXL prefixes for these components, which is a reasonable assumption.

build_diffusers_model (Corrected):

Loads the configurations from the reference model (or the default SDXL base) for all components: a CLIPTextConfig for text_encoder, a CLIPTextConfig for text_encoder_2, the AutoencoderKL configuration for vae, and the UNet2DConditionModel configuration for unet.

Creates the model instances from these loaded configurations: CLIPTextModel for text_encoder1, CLIPTextModelWithProjection (now used properly) for text_encoder2, AutoencoderKL for the VAE, and UNet2DConditionModel for the UNet. This is crucial for getting the correct model architecture.

Loads the extracted state dictionaries into the corresponding model instances using strict=False. This handles potential key mismatches or extra keys in the Illustrious-xl checkpoint.

Sets the components to float16 and moves them to the CPU.

convert_and_save_sdxl_to_diffusers: Remains mostly the same, but now correctly uses the two text encoders.

Other Functions: The rest of the code (downloading, uploading, Gradio interface) remains largely unchanged.

Testing and Further Steps

Test Thoroughly: Test this revised code with the Illustrious-xl model. It should now load the checkpoint correctly and create a Diffusers pipeline.

Verify Functionality: After converting, test the generated Diffusers model. Generate some images and compare them to the expected output from the Illustrious-xl model. This is crucial to ensure the conversion was successful and the model is working as intended.

Key Prefixes (If Still Errors): If you still encounter errors, it's possible that the Illustrious-xl model uses different key prefixes than the standard SDXL prefixes. In this case, you'll need to inspect the checkpoint's state dictionary keys directly (using a simplified loading script) to determine the correct prefixes and adjust load_sdxl_checkpoint accordingly.

Files changed (1) hide show

app.py +41 -69

app.py CHANGED Viewed

@@ -2,7 +2,7 @@ import os
 import gradio as gr
 import torch
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, AutoencoderKL
-from transformers import CLIPTextModel, CLIPTextConfig
 from safetensors.torch import load_file
 from collections import OrderedDict
 import re
@@ -67,42 +67,35 @@ def create_model_repo(api, user, orgs_name, model_name, make_private=False):
     try:
         api.create_repo(repo_id=repo_id, repo_type="model", private=make_private)
         print(f"Model repo '{repo_id}' created.")
-    except HfHubHTTPError:  # Corrected the exception name
         print(f"Model repo '{repo_id}' already exists.")
     return repo_id
 # ---------------------- MODEL LOADING AND CONVERSION ----------------------
 def download_model(model_path_or_url):
-    """Downloads a model, handling URLs, HF repos, and local paths, caching appropriately."""
     try:
-        # 1. Check if it's a valid Hugging Face repo ID (and potentially a file within)
         try:
             validate_repo_id(model_path_or_url)
-            # It's a valid repo ID; use hf_hub_download (it handles caching)
             local_path = hf_hub_download(repo_id=model_path_or_url)
             return local_path
         except HFValidationError:
-            pass  # Not a simple repo ID.  Might be repo ID + filename, or a URL.
         # 2. Check if it's a URL
-        if model_path_or_url.startswith("http://") or model_path_or_url.startswith(
-            "https://"
-        ):
-            # It's a URL : download and put into HF cache
             response = requests.get(model_path_or_url, stream=True)
-            response.raise_for_status()  # Raise HTTPError for bad requests (4xx or 5xx)
-            # Get filename from URL, or use a hash if we can't determine it
             parsed_url = urlparse(model_path_or_url)
             filename = os.path.basename(unquote(parsed_url.path))
             if not filename:
                 filename = hashlib.sha256(model_path_or_url.encode()).hexdigest()
-            # Construct the cache path (using HF_HUB_CACHE + "downloads")
             cache_dir = os.path.join(HUGGINGFACE_HUB_CACHE, "downloads")
-            os.makedirs(cache_dir, exist_ok=True)  # Ensure cache directory exists
             local_path = os.path.join(cache_dir, filename)
             with open(local_path, "wb") as f:
@@ -125,7 +118,6 @@ def download_model(model_path_or_url):
                     return local_path
                 else:
                     raise ValueError("Invalid input format.")
             except HFValidationError:
                 raise ValueError(f"Invalid model path or URL: {model_path_or_url}")
@@ -133,15 +125,14 @@ def download_model(model_path_or_url):
         raise ValueError(f"Error downloading or accessing model: {e}")
 def load_sdxl_checkpoint(checkpoint_path):
-    """Loads an SDXL checkpoint (.ckpt or .safetensors) and returns components."""
     if checkpoint_path.endswith(".safetensors"):
-        state_dict = load_file(checkpoint_path, device="cpu")  # Load to CPU
     elif checkpoint_path.endswith(".ckpt"):
-        state_dict = torch.load(checkpoint_path, map_location="cpu")[
-            "state_dict"
-        ]  # Load to CPU, access ["state_dict"]
     else:
         raise ValueError("Unsupported checkpoint format. Must be .safetensors or .ckpt")
@@ -152,82 +143,62 @@ def load_sdxl_checkpoint(checkpoint_path):
     for key, value in state_dict.items():
         if key.startswith("first_stage_model."):  # VAE
-            vae_state[key.replace("first_stage_model.", "")] = value.to(
-                torch.float16
-            )  # FP16 conversion
-        elif key.startswith("condition_model.model.text_encoder."):  # Text Encoder 1
-            text_encoder1_state[
-                key.replace("condition_model.model.text_encoder.", "")
-            ] = value.to(
-                torch.float16
-            )  # FP16
-        elif key.startswith(
-            "condition_model.model.text_encoder_2."
-        ):  # Text Encoder 2
-            text_encoder2_state[
-                key.replace("condition_model.model.text_encoder_2.", "")
-            ] = value.to(
-                torch.float16
-            )  # FP16
         elif key.startswith("model.diffusion_model."):  # UNet
-            unet_state[key.replace("model.diffusion_model.", "")] = value.to(
-                torch.float16
-            )  # FP16
     return text_encoder1_state, text_encoder2_state, vae_state, unet_state
 def build_diffusers_model(
-    text_encoder1_state,
-    text_encoder2_state,
-    vae_state,
-    unet_state,
-    reference_model_path=None,
 ):
-    """Builds the Diffusers pipeline components from the loaded state dicts."""
-    # Default to SDXL base 1.0 if no reference model is provided
     if not reference_model_path:
         reference_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
-    # 1. Text Encoders
     config_text_encoder1 = CLIPTextConfig.from_pretrained(
         reference_model_path, subfolder="text_encoder"
     )
     config_text_encoder2 = CLIPTextConfig.from_pretrained(
-        reference_model_path, subfolder="text_encoder_2"
     )
     text_encoder1 = CLIPTextModel(config_text_encoder1)
-    text_encoder2 = CLIPTextModel(config_text_encoder2)
-    text_encoder1.load_state_dict(text_encoder1_state)
-    text_encoder2.load_state_dict(text_encoder2_state)
-    text_encoder1.to(torch.float16).to("cpu")  # Ensure fp16 and CPU
-    text_encoder2.to(torch.float16).to("cpu")
-    # 2. VAE
-    vae = AutoencoderKL.from_pretrained(reference_model_path, subfolder="vae")
-    vae.load_state_dict(vae_state)
-    vae.to(torch.float16).to("cpu")
-    # 3. UNet
-    unet = UNet2DConditionModel.from_pretrained(reference_model_path, subfolder="unet")
-    unet.load_state_dict(unet_state)
     unet.to(torch.float16).to("cpu")
     return text_encoder1, text_encoder2, vae, unet
 def convert_and_save_sdxl_to_diffusers(
     checkpoint_path_or_url, output_path, reference_model_path
 ):
-    """Converts an SDXL checkpoint to Diffusers format and saves it.
-    Args:
-        checkpoint_path_or_url:  The path/URL/repo ID of the checkpoint.
-    """
-    # Download the model if necessary (handles URLs, repo IDs, and local paths)
     checkpoint_path = download_model(checkpoint_path_or_url)
     text_encoder1_state, text_encoder2_state, vae_state, unet_state = (
@@ -255,6 +226,7 @@ def convert_and_save_sdxl_to_diffusers(
     print(f"Model saved as Diffusers format: {output_path}")
 # ---------------------- UPLOAD FUNCTION ----------------------
 def upload_to_huggingface(model_path, hf_token, orgs_name, model_name, make_private):
     """Uploads a model to the Hugging Face Hub."""
@@ -362,7 +334,7 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             output = gr.Markdown() #Output is in its own column
-    convert_button.click( #CORRECT AREA
         fn=main,
         inputs=[
             model_to_load,

 import gradio as gr
 import torch
 from diffusers import StableDiffusionXLPipeline, UNet2DConditionModel, AutoencoderKL
+from transformers import CLIPTextModel, CLIPTextModelWithProjection, CLIPTextConfig, CLIPTokenizer
 from safetensors.torch import load_file
 from collections import OrderedDict
 import re
     try:
         api.create_repo(repo_id=repo_id, repo_type="model", private=make_private)
         print(f"Model repo '{repo_id}' created.")
+    except HfHubHTTPError:
         print(f"Model repo '{repo_id}' already exists.")
     return repo_id
 # ---------------------- MODEL LOADING AND CONVERSION ----------------------
 def download_model(model_path_or_url):
+    """Downloads a model, handling URLs, HF repos, and local paths."""
     try:
+        # 1. Check if it's a valid Hugging Face repo ID
         try:
             validate_repo_id(model_path_or_url)
             local_path = hf_hub_download(repo_id=model_path_or_url)
             return local_path
         except HFValidationError:
+            pass
         # 2. Check if it's a URL
+        if model_path_or_url.startswith("http://") or model_path_or_url.startswith("https://"):
             response = requests.get(model_path_or_url, stream=True)
+            response.raise_for_status()
             parsed_url = urlparse(model_path_or_url)
             filename = os.path.basename(unquote(parsed_url.path))
             if not filename:
                 filename = hashlib.sha256(model_path_or_url.encode()).hexdigest()
             cache_dir = os.path.join(HUGGINGFACE_HUB_CACHE, "downloads")
+            os.makedirs(cache_dir, exist_ok=True)
             local_path = os.path.join(cache_dir, filename)
             with open(local_path, "wb") as f:
                     return local_path
                 else:
                     raise ValueError("Invalid input format.")
             except HFValidationError:
                 raise ValueError(f"Invalid model path or URL: {model_path_or_url}")
         raise ValueError(f"Error downloading or accessing model: {e}")
 def load_sdxl_checkpoint(checkpoint_path):
+    """Loads checkpoint and extracts state dicts, handling Illustrious-xl."""
     if checkpoint_path.endswith(".safetensors"):
+        state_dict = load_file(checkpoint_path, device="cpu")
     elif checkpoint_path.endswith(".ckpt"):
+        state_dict = torch.load(checkpoint_path, map_location="cpu")["state_dict"]
     else:
         raise ValueError("Unsupported checkpoint format. Must be .safetensors or .ckpt")
     for key, value in state_dict.items():
         if key.startswith("first_stage_model."):  # VAE
+            vae_state[key.replace("first_stage_model.", "")] = value.to(torch.float16)
+        elif key.startswith("condition_model.model.text_encoder."):  # First Text Encoder
+            text_encoder1_state[key.replace("condition_model.model.text_encoder.", "")] = value.to(torch.float16)
+        elif key.startswith("condition_model.model.text_encoder_2."):  # Second Text Encoder
+            text_encoder2_state[key.replace("condition_model.model.text_encoder_2.", "")] = value.to(torch.float16)
         elif key.startswith("model.diffusion_model."):  # UNet
+            unet_state[key.replace("model.diffusion_model.", "")] = value.to(torch.float16)
     return text_encoder1_state, text_encoder2_state, vae_state, unet_state
 def build_diffusers_model(
+    text_encoder1_state, text_encoder2_state, vae_state, unet_state, reference_model_path=None
 ):
+    """Builds Diffusers components, loading state dicts with strict=False."""
     if not reference_model_path:
         reference_model_path = "stabilityai/stable-diffusion-xl-base-1.0"
+    # Load configurations from the reference model
     config_text_encoder1 = CLIPTextConfig.from_pretrained(
         reference_model_path, subfolder="text_encoder"
     )
     config_text_encoder2 = CLIPTextConfig.from_pretrained(
+       reference_model_path, subfolder="text_encoder_2"
     )
+    config_vae = AutoencoderKL.from_pretrained(reference_model_path, subfolder="vae").config
+    config_unet = UNet2DConditionModel.from_pretrained(reference_model_path, subfolder="unet").config
+    # Create instances using the configurations
     text_encoder1 = CLIPTextModel(config_text_encoder1)
+    text_encoder2 = CLIPTextModelWithProjection(config_text_encoder2) # Use CLIPTextModelWithProjection
+    vae = AutoencoderKL(config=config_vae)
+    unet = UNet2DConditionModel(config=config_unet)
+    # Load state dicts with strict=False
+    text_encoder1.load_state_dict(text_encoder1_state, strict=False)
+    text_encoder2.load_state_dict(text_encoder2_state, strict=False)
+    vae.load_state_dict(vae_state, strict=False)
+    unet.load_state_dict(unet_state, strict=False)
+    text_encoder1.to(torch.float16).to("cpu")
+    text_encoder2.to(torch.float16).to("cpu")
+    vae.to(torch.float16).to("cpu")
     unet.to(torch.float16).to("cpu")
     return text_encoder1, text_encoder2, vae, unet
 def convert_and_save_sdxl_to_diffusers(
     checkpoint_path_or_url, output_path, reference_model_path
 ):
+    """Converts and saves the Illustrious-xl checkpoint to Diffusers format."""
     checkpoint_path = download_model(checkpoint_path_or_url)
     text_encoder1_state, text_encoder2_state, vae_state, unet_state = (
     print(f"Model saved as Diffusers format: {output_path}")
 # ---------------------- UPLOAD FUNCTION ----------------------
 def upload_to_huggingface(model_path, hf_token, orgs_name, model_name, make_private):
     """Uploads a model to the Hugging Face Hub."""
         with gr.Column():
             output = gr.Markdown() #Output is in its own column
+    convert_button.click(
         fn=main,
         inputs=[
             model_to_load,