Upload create-xllsd.py
Browse files- create-xllsd.py +60 -0
create-xllsd.py
ADDED
|
@@ -0,0 +1,60 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python
|
| 2 |
+
|
| 3 |
+
"""
|
| 4 |
+
This script pulls in the various standard components for
|
| 5 |
+
an SD1.5 architecture model from DIFFERENT places.
|
| 6 |
+
It takes original SD1.5 base, but then pulls in the improved VAE
|
| 7 |
+
from SDXL, and then an improved "Long CLIP" text encoder from elsewhere.
|
| 8 |
+
It then writes out a combined model in "diffusers" format.
|
| 9 |
+
That is more or less the contents of
|
| 10 |
+
https://huggingface.co/opendiffusionai/xllsd-alpha0
|
| 11 |
+
|
| 12 |
+
Feel free to use it for your own model creation experiments.
|
| 13 |
+
Of note to most people is that it pulls in the "float32" versions.
|
| 14 |
+
However, people with smaller hardware may wish to specify
|
| 15 |
+
torch_dtype=torch.float16
|
| 16 |
+
if they are just going to train in float16 or bf16 anyway
|
| 17 |
+
"""
|
| 18 |
+
|
| 19 |
+
from transformers import CLIPTextModel, CLIPTokenizer
|
| 20 |
+
|
| 21 |
+
from diffusers import StableDiffusionPipeline, AutoencoderKL
|
| 22 |
+
import torch
|
| 23 |
+
|
| 24 |
+
print("Loading main model")
|
| 25 |
+
# Load SD1.5 diffusers model in FP32
|
| 26 |
+
pipe = StableDiffusionPipeline.from_pretrained("stable-diffusion-v1-5/stable-diffusion-v1-5", torch_dtype=torch.float32)
|
| 27 |
+
|
| 28 |
+
print("Loading LONG CLIP")
|
| 29 |
+
# Load LongCLIP text encoder in FP32
|
| 30 |
+
clip_path = "zer0int/LongCLIP-GmP-ViT-L-14"
|
| 31 |
+
new_text_encoder = CLIPTextModel.from_pretrained(clip_path)
|
| 32 |
+
new_tokenizer = CLIPTokenizer.from_pretrained(clip_path)
|
| 33 |
+
|
| 34 |
+
print("Loading SDXL VAE")
|
| 35 |
+
new_vae = AutoencoderKL.from_pretrained(
|
| 36 |
+
"stabilityai/sdxl-vae",
|
| 37 |
+
torch_dtype=torch.float32
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
# Replace the text encoder, tokenizer, and VAE
|
| 42 |
+
pipe.text_encoder = new_text_encoder
|
| 43 |
+
pipe.tokenizer = new_tokenizer
|
| 44 |
+
pipe.vae = new_vae
|
| 45 |
+
|
| 46 |
+
# Move the pipeline to GPU to confirm everything loads
|
| 47 |
+
print("Combining...")
|
| 48 |
+
# This move is only a load sanity check; fall back to CPU when no CUDA device
# is present so the script can still reach the save step.
pipe.to("cuda" if torch.cuda.is_available() else "cpu")
|
| 49 |
+
|
| 50 |
+
###############################################################
|
| 51 |
+
# Save the updated pipeline in Diffusers format
|
| 52 |
+
# If you are going to convert to a single .safetensors file, set safe_serialization=False
|
| 53 |
+
# But if you are going to use it in place, set it to True
|
| 54 |
+
###############################################################
|
| 55 |
+
|
| 56 |
+
outname = "XLLsd_df"
|
| 57 |
+
pipe.save_pretrained(outname, safe_serialization=True)
|
| 58 |
+
|
| 59 |
+
print(f"Replaced text encoder and saved pipeline to {outname}")
|
| 60 |
+
|