Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes. See raw diff
- .DS_Store +0 -0
- .gitattributes +43 -16
- README.md +751 -3
- fix_upload_issues.md +273 -0
- models/.DS_Store +0 -0
- models/control_v11p_sd21_canny/config.json +0 -0
- models/control_v11p_sd21_canny/config_paddle.json +0 -0
- models/control_v11p_sd21_canny/diffusion_pytorch_model.safetensors +0 -0
- models/control_v11p_sd21_canny/gitattributes +0 -0
- models/control_v11p_sd21_canny_paddle/README.md +3 -0
- models/control_v11p_sd21_canny_paddle/config.json +52 -0
- models/control_v11p_sd21_canny_paddle/conversion_guide.md +43 -0
- models/control_v11p_sd21_canny_paddle/conversion_status.md +1 -0
- models/control_v11p_sd21_canny_paddle/diffusion_pytorch_model.safetensors +3 -0
- models/control_v11p_sd21_canny_paddle/gitattributes +35 -0
- models/control_v11p_sd21_canny_paddle/model_info.json +1 -0
- models/control_v11p_sd21_canny_paddle/weight_conversion_status.json +12 -0
- models/finetuned/thangka_21_ACD_250.safetensors +3 -0
- models/finetuned/thangka_21_Status_140.safetensors +3 -0
- models/finetuned_paddle/.DS_Store +0 -0
- models/finetuned_paddle/README.md +3 -0
- models/finetuned_paddle/conversion_guide.md +32 -0
- models/finetuned_paddle/model_info.json +1 -0
- models/finetuned_paddle/thangka_21_ACD_250/model.pdparams +3 -0
- models/finetuned_paddle/thangka_21_ACD_250/model_info.json +7 -0
- models/finetuned_paddle/thangka_21_ACD_250_paddle.pdparams +3 -0
- models/finetuned_paddle/thangka_21_Status_140/model.pdparams +3 -0
- models/finetuned_paddle/thangka_21_Status_140/model_info.json +7 -0
- models/finetuned_paddle/thangka_21_Status_140_paddle.pdparams +3 -0
- models/sd2.1_base/.DS_Store +0 -0
- models/sd2.1_base/README.md +187 -0
- models/sd2.1_base/feature_extractor/preprocessor_config.json +20 -0
- models/sd2.1_base/gitattributes +34 -0
- models/sd2.1_base/model_index.json +33 -0
- models/sd2.1_base/model_index_paddle.json +33 -0
- models/sd2.1_base/scheduler/scheduler_config.json +14 -0
- models/sd2.1_base/text_encoder/config.json +25 -0
- models/sd2.1_base/text_encoder/model.safetensors +3 -0
- models/sd2.1_base/tokenizer/merges.txt +0 -0
- models/sd2.1_base/tokenizer/special_tokens_map.json +24 -0
- models/sd2.1_base/tokenizer/tokenizer_config.json +34 -0
- models/sd2.1_base/tokenizer/vocab.json +0 -0
- models/sd2.1_base/unet/config.json +45 -0
- models/sd2.1_base/unet/diffusion_pytorch_model.safetensors +3 -0
- models/sd2.1_base/v2-1_512-nonema-pruned.safetensors +3 -0
- models/sd2.1_base/vae/config.json +29 -0
- models/sd2.1_base/vae/diffusion_pytorch_model.safetensors +3 -0
- models/sd2.1_base_paddle/README.md +187 -0
- models/sd2.1_base_paddle/config.json +38 -0
- models/sd2.1_base_paddle/conversion_guide.md +43 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
.gitattributes
CHANGED
|
@@ -1,35 +1,62 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.
|
| 34 |
-
*.
|
| 35 |
-
*
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.pdparams filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.pdmodel filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.jpg filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.mpg filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.mp4 filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.png filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.bin.* filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.iso filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.7zip filter=lfs diff=lfs merge=lfs -text
|
| 13 |
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.tar.gz filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.avi filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.exe filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.bmp filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pptx filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.ppt filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.doc filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.docx filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.xls filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.xlsx filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
*.pdf filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.msi filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.jar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.ico filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.gif filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wmv filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.checkpoint filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.arrow filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 35 |
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 36 |
*.ftz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 37 |
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 38 |
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 39 |
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 40 |
*.model filter=lfs diff=lfs merge=lfs -text
|
| 41 |
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
| 42 |
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 43 |
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 44 |
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 45 |
*.pb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
| 46 |
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 47 |
*.pth filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
| 48 |
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 49 |
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 50 |
*.tgz filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 51 |
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
*.zstandard filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
*.tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
*.db* filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
*.ark* filter=lfs diff=lfs merge=lfs -text
|
| 56 |
+
**/*ckpt*data* filter=lfs diff=lfs merge=lfs -text
|
| 57 |
+
**/*ckpt*.meta filter=lfs diff=lfs merge=lfs -text
|
| 58 |
+
**/*ckpt*.index filter=lfs diff=lfs merge=lfs -text
|
| 59 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 60 |
+
*.pdiparams filter=lfs diff=lfs merge=lfs -text
|
| 61 |
+
label_dict.* filter=lfs diff=lfs merge=lfs -text
|
| 62 |
+
*.gguf filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,751 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
language:
|
| 4 |
+
- zh
|
| 5 |
+
- en
|
| 6 |
+
tags:
|
| 7 |
+
- thangka
|
| 8 |
+
- image-restoration
|
| 9 |
+
- stable-diffusion
|
| 10 |
+
- lora
|
| 11 |
+
- cultural-heritage
|
| 12 |
+
- paddlepaddle
|
| 13 |
+
- buddhist-art
|
| 14 |
+
datasets:
|
| 15 |
+
- custom-thangka-1376
|
| 16 |
+
metrics:
|
| 17 |
+
- psnr
|
| 18 |
+
- ssim
|
| 19 |
+
pipeline_tag: image-to-image
|
| 20 |
+
widget:
|
| 21 |
+
- text: "traditional thangka art, Shakyamuni Buddha, detailed painting, vibrant colors, gold outlines"
|
| 22 |
+
example_title: "Buddha Restoration"
|
| 23 |
+
- text: "traditional thangka art, Green Tara, 18th century Tibetan style, mineral pigments, masterpiece"
|
| 24 |
+
example_title: "Tara Restoration"
|
| 25 |
+
---
|
| 26 |
+
|
| 27 |
+
# 🎨 唐卡修复AI模型 / Thangka Restoration AI Models
|
| 28 |
+
|
| 29 |
+
<div align="center">
|
| 30 |
+
|
| 31 |
+
[](https://github.com/WangchukMind/thangka-restoration-ai)
|
| 32 |
+
[](LICENSE)
|
| 33 |
+
[](https://paddlepaddle.org.cn)
|
| 34 |
+
[](https://huggingface.co/Wangchuk1376)
|
| 35 |
+
|
| 36 |
+
**专门用于藏传佛教唐卡艺术修复的AI模型集合**
|
| 37 |
+
|
| 38 |
+
[English](#english-version) | [中文](#chinese-version)
|
| 39 |
+
|
| 40 |
+
</div>
|
| 41 |
+
|
| 42 |
+
---
|
| 43 |
+
|
| 44 |
+
## <a name="chinese-version"></a>🌟 项目简介
|
| 45 |
+
|
| 46 |
+
这是一套专门用于藏传佛教唐卡艺术修复的AI模型集合,基于**Stable Diffusion 2.1**和**LoRA微调技术**,在**1376幅**专业标注的唐卡图像上训练而成。
|
| 47 |
+
|
| 48 |
+
### 核心特点
|
| 49 |
+
|
| 50 |
+
- ✅ **专业训练**: 1376幅高质量唐卡图像数据集
|
| 51 |
+
- ✅ **文化准确**: 保持传统唐卡艺术特征,文化准确性>95%
|
| 52 |
+
- ✅ **高效修复**: 基于LoRA技术,快速适应不同风格
|
| 53 |
+
- ✅ **多种模型**: 提供多个LoRA模型,适应不同修复需求
|
| 54 |
+
- ✅ **PaddlePaddle**: 完全适配PaddlePaddle深度学习框架
|
| 55 |
+
|
| 56 |
+
### 开发信息
|
| 57 |
+
|
| 58 |
+
- **开发者**: Wangchuk Mind
|
| 59 |
+
- **机构**: 四川大学计算机学院
|
| 60 |
+
- **框架**: PaddlePaddle 2.6.2
|
| 61 |
+
- **基础模型**: Stable Diffusion 2.1
|
| 62 |
+
- **许可证**: MIT License
|
| 63 |
+
|
| 64 |
+
---
|
| 65 |
+
|
| 66 |
+
## 📦 模型列表
|
| 67 |
+
|
| 68 |
+
### 1. 基础模型
|
| 69 |
+
|
| 70 |
+
#### Stable Diffusion 2.1 Base (PaddlePaddle版)
|
| 71 |
+
- **路径**: `models/sd2.1_base_paddle/`
|
| 72 |
+
- **参数量**: 1.4B
|
| 73 |
+
- **组成部分**:
|
| 74 |
+
- UNet: 图像去噪网络
|
| 75 |
+
- VAE: 变分自编码器
|
| 76 |
+
- Text Encoder: 文本编码器
|
| 77 |
+
- Tokenizer: 分词器
|
| 78 |
+
- **用途**: 基础图像生成和修复
|
| 79 |
+
- **输入分辨率**: 512×512 (标准), 768×768, 1024×1024
|
| 80 |
+
|
| 81 |
+
#### ControlNet Canny (PaddlePaddle版)
|
| 82 |
+
- **路径**: `models/control_v11p_sd21_canny_paddle/`
|
| 83 |
+
- **参数量**: 361M
|
| 84 |
+
- **功能**: 边缘引导的精确控制
|
| 85 |
+
- **用途**:
|
| 86 |
+
- 保持图像结构和线条
|
| 87 |
+
- 精确控制修复区域
|
| 88 |
+
- 适合线条清晰的唐卡修复
|
| 89 |
+
|
| 90 |
+
### 2. LoRA微调模型
|
| 91 |
+
|
| 92 |
+
#### thangka_21_Status_140 ⭐ (推荐)
|
| 93 |
+
- **文件**: `models/finetuned/thangka_21_Status_140.safetensors`
|
| 94 |
+
- **格式**: SafeTensors
|
| 95 |
+
- **大小**: ~20MB
|
| 96 |
+
- **训练步数**: 140 epochs
|
| 97 |
+
- **LoRA参数**:
|
| 98 |
+
- Rank: 8
|
| 99 |
+
- Alpha: 16
|
| 100 |
+
- **推荐用途**: 标准唐卡修复
|
| 101 |
+
- **适用风格**:
|
| 102 |
+
- 18世纪西藏风格
|
| 103 |
+
- 传统佛教本尊唐卡
|
| 104 |
+
- 常规损伤修复
|
| 105 |
+
|
| 106 |
+
#### thangka_21_ACD_250
|
| 107 |
+
- **文件**: `models/finetuned/thangka_21_ACD_250.safetensors`
|
| 108 |
+
- **格式**: SafeTensors
|
| 109 |
+
- **大小**: ~20MB
|
| 110 |
+
- **训练步数**: 250 epochs
|
| 111 |
+
- **LoRA参数**:
|
| 112 |
+
- Rank: 8
|
| 113 |
+
- Alpha: 16
|
| 114 |
+
- **推荐用途**: 高质量细节修复
|
| 115 |
+
- **适用风格**:
|
| 116 |
+
- 精细绘制的唐卡
|
| 117 |
+
- 复杂图案修复
|
| 118 |
+
- 高级颜色还原
|
| 119 |
+
|
| 120 |
+
### 3. PaddlePaddle专用模型
|
| 121 |
+
|
| 122 |
+
位于 `models/finetuned_paddle/` 和 `models/sd2.1_base_paddle/`,这些是转换为PaddlePaddle格式的模型文件(`.pdparams`),可直接在PaddlePaddle框架中使用。
|
| 123 |
+
|
| 124 |
+
---
|
| 125 |
+
|
| 126 |
+
## 🎓 训练数据集
|
| 127 |
+
|
| 128 |
+
### 数据集概述
|
| 129 |
+
|
| 130 |
+
- **总量**: 1376幅高质量唐卡图像
|
| 131 |
+
- **平均分辨率**: 2048×2048像素
|
| 132 |
+
- **格式**: PNG, JPG
|
| 133 |
+
- **来源**:
|
| 134 |
+
- 实验室专业采集
|
| 135 |
+
- 博物馆授权数字化
|
| 136 |
+
- 专业摄影收集
|
| 137 |
+
|
| 138 |
+
### 标注体系
|
| 139 |
+
|
| 140 |
+
#### 1. 艺术风格分类
|
| 141 |
+
- **地域风格**: 西藏、尼泊尔、蒙古、青海、四川
|
| 142 |
+
- **时代风格**: 18世纪、19世纪、当代
|
| 143 |
+
- **画派**: 卫藏画派、康巴画派、安多画派
|
| 144 |
+
|
| 145 |
+
#### 2. 题材分类
|
| 146 |
+
- **佛教本尊**: 释迦牟尼佛、观音菩萨、文殊菩萨、绿度母等
|
| 147 |
+
- **护法神**: 四臂观音、玛哈嘎拉、大威德金刚等
|
| 148 |
+
- **坛城**: 时轮金刚坛城、胜乐金刚坛城等
|
| 149 |
+
- **历史人物**: 宗喀巴大师、莲花生大师等
|
| 150 |
+
|
| 151 |
+
#### 3. 技术参数标注
|
| 152 |
+
- **颜色特征**: 矿物颜料类型、色彩饱和度、明度
|
| 153 |
+
- **构图分析**: 主尊位置、周边布局、对称性
|
| 154 |
+
- **损伤类型**: 磨损、褪色、破损、污渍、开裂
|
| 155 |
+
|
| 156 |
+
#### 4. 文化信息标注
|
| 157 |
+
- **宗教意义**: 佛教内涵、象征意义
|
| 158 |
+
- **历史背景**: 创作年代、流派传承
|
| 159 |
+
- **艺术特色**: 绘制技法、风格特征
|
| 160 |
+
|
| 161 |
+
---
|
| 162 |
+
|
| 163 |
+
## 💻 使用方法
|
| 164 |
+
|
| 165 |
+
### 环境要求
|
| 166 |
+
|
| 167 |
+
```bash
|
| 168 |
+
# Python版本
|
| 169 |
+
Python >= 3.9
|
| 170 |
+
|
| 171 |
+
# 核心依赖
|
| 172 |
+
paddlepaddle-gpu >= 2.6.0 # GPU版本 (推荐)
|
| 173 |
+
# 或
|
| 174 |
+
paddlepaddle >= 2.6.0 # CPU版本
|
| 175 |
+
|
| 176 |
+
# 其他依赖
|
| 177 |
+
pip install Pillow opencv-python numpy
|
| 178 |
+
```
|
| 179 |
+
|
| 180 |
+
### 快速开始
|
| 181 |
+
|
| 182 |
+
#### 1. 基础修复示例
|
| 183 |
+
|
| 184 |
+
```python
|
| 185 |
+
import paddle
|
| 186 |
+
from PIL import Image
|
| 187 |
+
import numpy as np
|
| 188 |
+
|
| 189 |
+
# 这里是简化的示例,完整代码请参考GitHub仓库
|
| 190 |
+
# https://github.com/WangchukMind/thangka-restoration-ai
|
| 191 |
+
|
| 192 |
+
# 加载模型 (伪代码 - 实际使用请参考完整系统)
|
| 193 |
+
from diffusion_paddle import load_model, load_lora, inpaint
|
| 194 |
+
|
| 195 |
+
# 加载基础模型
|
| 196 |
+
pipe = load_model(
|
| 197 |
+
model_path="models/sd2.1_base_paddle",
|
| 198 |
+
device="gpu" # 或 "cpu"
|
| 199 |
+
)
|
| 200 |
+
|
| 201 |
+
# 加载LoRA模型
|
| 202 |
+
load_lora(pipe, "models/finetuned/thangka_21_Status_140.safetensors")
|
| 203 |
+
|
| 204 |
+
# 加载待修复图像
|
| 205 |
+
image = Image.open("damaged_thangka.png").resize((512, 512))
|
| 206 |
+
mask = Image.open("damage_mask.png").resize((512, 512))
|
| 207 |
+
|
| 208 |
+
# 执行修复
|
| 209 |
+
result = inpaint(
|
| 210 |
+
pipe=pipe,
|
| 211 |
+
image=image,
|
| 212 |
+
mask=mask,
|
| 213 |
+
prompt="traditional thangka art, Buddha, detailed, vibrant colors, gold outlines",
|
| 214 |
+
negative_prompt="low quality, blurry, distorted, modern style",
|
| 215 |
+
num_inference_steps=30,
|
| 216 |
+
guidance_scale=7.5,
|
| 217 |
+
strength=0.8
|
| 218 |
+
)
|
| 219 |
+
|
| 220 |
+
# 保存结果
|
| 221 |
+
result.save("restored_thangka.png")
|
| 222 |
+
```
|
| 223 |
+
|
| 224 |
+
#### 2. 使用ControlNet边缘控制
|
| 225 |
+
|
| 226 |
+
```python
|
| 227 |
+
# 加载ControlNet
|
| 228 |
+
from diffusion_paddle import load_controlnet
|
| 229 |
+
|
| 230 |
+
controlnet = load_controlnet("models/control_v11p_sd21_canny_paddle")
|
| 231 |
+
|
| 232 |
+
# 提取边缘
|
| 233 |
+
from skimage.feature import canny
|
| 234 |
+
edges = canny(np.array(image.convert('L')), sigma=1)
|
| 235 |
+
edge_image = Image.fromarray((edges * 255).astype(np.uint8))
|
| 236 |
+
|
| 237 |
+
# 使用ControlNet修复
|
| 238 |
+
result = inpaint_with_control(
|
| 239 |
+
pipe=pipe,
|
| 240 |
+
image=image,
|
| 241 |
+
mask=mask,
|
| 242 |
+
control_image=edge_image,
|
| 243 |
+
controlnet=controlnet,
|
| 244 |
+
prompt="traditional thangka art, detailed restoration",
|
| 245 |
+
num_inference_steps=30
|
| 246 |
+
)
|
| 247 |
+
```
|
| 248 |
+
|
| 249 |
+
### 完整系统安装
|
| 250 |
+
|
| 251 |
+
完整的Web应用系统请访问GitHub:
|
| 252 |
+
|
| 253 |
+
```bash
|
| 254 |
+
# 克隆完整系统
|
| 255 |
+
git clone https://github.com/WangchukMind/thangka-restoration-ai.git
|
| 256 |
+
cd thangka-restoration-ai
|
| 257 |
+
|
| 258 |
+
# 安装依赖
|
| 259 |
+
cd Django
|
| 260 |
+
pip install -r requirements_paddle.txt
|
| 261 |
+
|
| 262 |
+
# 下载模型文件
|
| 263 |
+
# 模型文件较大,请从以下地址下载:
|
| 264 |
+
# Hugging Face: https://huggingface.co/Wangchuk1376/ThangkaModels
|
| 265 |
+
# 或参考 MODEL_DOWNLOAD.md
|
| 266 |
+
|
| 267 |
+
# 启动系统
|
| 268 |
+
python start_server.py runserver
|
| 269 |
+
|
| 270 |
+
# 或使用MVP简化版本
|
| 271 |
+
cd ..
|
| 272 |
+
python start_mvp_product.py
|
| 273 |
+
```
|
| 274 |
+
|
| 275 |
+
访问 `http://localhost:3000` 使用Web界面。
|
| 276 |
+
|
| 277 |
+
---
|
| 278 |
+
|
| 279 |
+
## 🎯 适用场景
|
| 280 |
+
|
| 281 |
+
### 1. 文化遗产保护
|
| 282 |
+
- ✅ 博物馆馆藏唐卡数字化修复
|
| 283 |
+
- ✅ 寺庙古旧唐卡虚拟修复
|
| 284 |
+
- ✅ 文物损伤评估和记录
|
| 285 |
+
- ✅ 数字化存档和展示
|
| 286 |
+
|
| 287 |
+
### 2. 学术研究
|
| 288 |
+
- ✅ 唐卡艺术风格研究
|
| 289 |
+
- ✅ AI图像修复算法研究
|
| 290 |
+
- ✅ 跨学科文化研究
|
| 291 |
+
- ✅ 教学演示和培训
|
| 292 |
+
|
| 293 |
+
### 3. 商业应用
|
| 294 |
+
- ✅ 私人收藏唐卡修复
|
| 295 |
+
- ✅ 艺术品价值评估辅助
|
| 296 |
+
- ✅ 文创产品设计
|
| 297 |
+
- ✅ 数字艺术创作
|
| 298 |
+
|
| 299 |
+
### 4. 教育推广
|
| 300 |
+
- ✅ 藏传佛教艺术教学
|
| 301 |
+
- ✅ 文化遗产教育
|
| 302 |
+
- ✅ AI技术科普
|
| 303 |
+
- ✅ 交互式文化体验
|
| 304 |
+
|
| 305 |
+
---
|
| 306 |
+
|
| 307 |
+
## 📊 性能指标
|
| 308 |
+
|
| 309 |
+
### 修复质量
|
| 310 |
+
|
| 311 |
+
| 指标 | 数值 | 说明 |
|
| 312 |
+
|------|------|------|
|
| 313 |
+
| **PSNR** | >30dB | 峰值信噪比,越高越好 |
|
| 314 |
+
| **SSIM** | >0.90 | 结构相似性,1.0为完全相同 |
|
| 315 |
+
| **文化准确性** | >95% | 专家评估的文化特征保持度 |
|
| 316 |
+
| **用户满意度** | >90% | 用户调查满意度 |
|
| 317 |
+
|
| 318 |
+
### 推理性能
|
| 319 |
+
|
| 320 |
+
#### GPU性能 (NVIDIA RTX 3080, 10GB)
|
| 321 |
+
|
| 322 |
+
| 分辨率 | 步数 | 时间 | 显存占用 |
|
| 323 |
+
|--------|------|------|----------|
|
| 324 |
+
| 512×512 | 20 | 2-3分钟 | 8GB |
|
| 325 |
+
| 512×512 | 30 | 3-5分钟 | 8GB |
|
| 326 |
+
| 512×512 | 50 | 5-10分钟 | 8GB |
|
| 327 |
+
| 768×768 | 30 | 5-8分钟 | 10GB |
|
| 328 |
+
|
| 329 |
+
#### CPU性能 (Intel i7-10700K)
|
| 330 |
+
|
| 331 |
+
| 分辨率 | 步数 | 时间 |
|
| 332 |
+
|--------|------|------|
|
| 333 |
+
| 512×512 | 20 | 10-15分钟 |
|
| 334 |
+
| 512×512 | 30 | 15-20分钟 |
|
| 335 |
+
|
| 336 |
+
### 批量处理能力
|
| 337 |
+
|
| 338 |
+
- **单GPU同时处理**: 1-2张图像
|
| 339 |
+
- **队列处理**: 支持10+用户排队
|
| 340 |
+
- **并行优化**: 支持批量生成(1-4张)
|
| 341 |
+
|
| 342 |
+
---
|
| 343 |
+
|
| 344 |
+
## ⚠️ 模型局限性
|
| 345 |
+
|
| 346 |
+
### 适用范围
|
| 347 |
+
|
| 348 |
+
#### ✅ 适合修复的场景:
|
| 349 |
+
- 藏传佛教传统唐卡
|
| 350 |
+
- 常见损伤类型 (磨损、褪色、破损、污渍)
|
| 351 |
+
- 清晰的原始图像基础
|
| 352 |
+
- 18-19世纪主流风格
|
| 353 |
+
|
| 354 |
+
#### ⚠️ 可能效果不佳的场景:
|
| 355 |
+
- 现代风格或创新风格唐卡
|
| 356 |
+
- 极度严重的损坏 (>50%缺失)
|
| 357 |
+
- 非常模糊的原始图像
|
| 358 |
+
- 非藏传佛教艺术品
|
| 359 |
+
|
| 360 |
+
### 可能的偏差
|
| 361 |
+
|
| 362 |
+
#### 风格偏向:
|
| 363 |
+
- 模型主要训练于18-19世纪西藏风格唐卡
|
| 364 |
+
- 对尼泊尔、蒙古等其他地域风格支持相对较弱
|
| 365 |
+
- 当代唐卡风格可能不够准确
|
| 366 |
+
|
| 367 |
+
#### 色彩偏向:
|
| 368 |
+
- 倾向于传统矿物颜料色系
|
| 369 |
+
- 可能不适合现代化学颜料的色彩
|
| 370 |
+
- 金色和特殊颜料需要特别注意
|
| 371 |
+
|
| 372 |
+
#### 题材偏向:
|
| 373 |
+
- 佛教本尊类唐卡效果最好
|
| 374 |
+
- 护法神、坛城类次之
|
| 375 |
+
- 历史故事、风俗类相对较弱
|
| 376 |
+
|
| 377 |
+
### 使用建议
|
| 378 |
+
|
| 379 |
+
1. **预处理**: 先进行图像清晰度增强
|
| 380 |
+
2. **分块修复**: 大面积损坏分批次修复
|
| 381 |
+
3. **迭代优化**: 使用"作为输入"功能多次迭代
|
| 382 |
+
4. **专家审核**: 重要文物修复建议专家审核
|
| 383 |
+
5. **参数调整**: 根据实际效果调整参数
|
| 384 |
+
|
| 385 |
+
---
|
| 386 |
+
|
| 387 |
+
## 🔬 训练流程
|
| 388 |
+
|
| 389 |
+
### 数据准备
|
| 390 |
+
|
| 391 |
+
#### 1. 数据收集
|
| 392 |
+
- 高分辨率唐卡图像采集
|
| 393 |
+
- 质量筛选和清洗
|
| 394 |
+
- 版权确认和授权
|
| 395 |
+
|
| 396 |
+
#### 2. 数据预处理
|
| 397 |
+
```python
|
| 398 |
+
# 图像预处理流程
|
| 399 |
+
1. 去除水印和边框
|
| 400 |
+
2. 调整分辨率到标准尺寸
|
| 401 |
+
3. 色彩校正和归一化
|
| 402 |
+
4. 格式转换 (PNG/JPG)
|
| 403 |
+
```
|
| 404 |
+
|
| 405 |
+
#### 3. 文本标注
|
| 406 |
+
```python
|
| 407 |
+
# 标注示例
|
| 408 |
+
{
|
| 409 |
+
"image": "buddha_001.png",
|
| 410 |
+
"prompt": "traditional thangka art, Shakyamuni Buddha, 18th century Tibetan style, detailed painting, vibrant colors, gold outlines, lotus throne, mineral pigments, masterpiece",
|
| 411 |
+
"style": "Tibetan 18th century",
|
| 412 |
+
"subject": "Shakyamuni Buddha",
|
| 413 |
+
"colors": ["gold", "red", "blue", "green"],
|
| 414 |
+
"condition": "good"
|
| 415 |
+
}
|
| 416 |
+
```
|
| 417 |
+
|
| 418 |
+
### LoRA训练配置
|
| 419 |
+
|
| 420 |
+
```yaml
|
| 421 |
+
# 训练配置 (thangka_21_Status_140)
|
| 422 |
+
base_model: "stabilityai/stable-diffusion-2-1-base"
|
| 423 |
+
resolution: 512
|
| 424 |
+
train_batch_size: 4
|
| 425 |
+
gradient_accumulation_steps: 4
|
| 426 |
+
learning_rate: 1e-4
|
| 427 |
+
lr_scheduler: "constant"
|
| 428 |
+
lr_warmup_steps: 0
|
| 429 |
+
max_train_steps: 140
|
| 430 |
+
lora_rank: 8
|
| 431 |
+
lora_alpha: 16
|
| 432 |
+
lora_dropout: 0.0
|
| 433 |
+
mixed_precision: "fp16"
|
| 434 |
+
seed: 42
|
| 435 |
+
|
| 436 |
+
# 数据增强
|
| 437 |
+
random_flip: true
|
| 438 |
+
center_crop: false
|
| 439 |
+
```
|
| 440 |
+
|
| 441 |
+
### 训练过程
|
| 442 |
+
|
| 443 |
+
```python
|
| 444 |
+
# 伪代码 - 实际训练脚本更复杂
|
| 445 |
+
import paddle
|
| 446 |
+
from paddlenlp.transformers import StableDiffusionPipeline
|
| 447 |
+
|
| 448 |
+
# 1. 加载基础模型
|
| 449 |
+
pipe = StableDiffusionPipeline.from_pretrained(
|
| 450 |
+
"stabilityai/stable-diffusion-2-1-base",
|
| 451 |
+
paddle_dtype=paddle.float16
|
| 452 |
+
)
|
| 453 |
+
|
| 454 |
+
# 2. 配置LoRA
|
| 455 |
+
from peft import LoraConfig, get_peft_model
|
| 456 |
+
|
| 457 |
+
lora_config = LoraConfig(
|
| 458 |
+
r=8, # LoRA秩
|
| 459 |
+
lora_alpha=16, # LoRA缩放
|
| 460 |
+
target_modules=[ # 应用LoRA的模块
|
| 461 |
+
"to_q", "to_k", "to_v", "to_out.0"
|
| 462 |
+
],
|
| 463 |
+
lora_dropout=0.0
|
| 464 |
+
)
|
| 465 |
+
|
| 466 |
+
# 3. 训练
|
| 467 |
+
for epoch in range(140):
|
| 468 |
+
for batch in dataloader:
|
| 469 |
+
# 前向传播
|
| 470 |
+
loss = compute_loss(pipe, batch)
|
| 471 |
+
|
| 472 |
+
# 反向传播
|
| 473 |
+
loss.backward()
|
| 474 |
+
optimizer.step()
|
| 475 |
+
optimizer.clear_grad()
|
| 476 |
+
|
| 477 |
+
# 4. 保存LoRA权重
|
| 478 |
+
save_lora_weights(pipe, "thangka_21_Status_140.safetensors")
|
| 479 |
+
```
|
| 480 |
+
|
| 481 |
+
---
|
| 482 |
+
|
| 483 |
+
## 📈 评估方法
|
| 484 |
+
|
| 485 |
+
### 1. 客观评估
|
| 486 |
+
|
| 487 |
+
#### PSNR (峰值信噪比)
|
| 488 |
+
```python
|
| 489 |
+
import numpy as np
|
| 490 |
+
from skimage.metrics import peak_signal_noise_ratio
|
| 491 |
+
|
| 492 |
+
psnr = peak_signal_noise_ratio(original, restored)
|
| 493 |
+
```
|
| 494 |
+
|
| 495 |
+
#### SSIM (结构相似性)
|
| 496 |
+
```python
|
| 497 |
+
from skimage.metrics import structural_similarity
|
| 498 |
+
|
| 499 |
+
ssim = structural_similarity(
|
| 500 |
+
original, restored,
|
| 501 |
+
multichannel=True,
|
| 502 |
+
data_range=255
|
| 503 |
+
)
|
| 504 |
+
```
|
| 505 |
+
|
| 506 |
+
### 2. 主观评估
|
| 507 |
+
|
| 508 |
+
#### 专家盲测
|
| 509 |
+
- 邀请5位唐卡艺术专家
|
| 510 |
+
- 评估标准:
|
| 511 |
+
- 色彩准确性 (30%)
|
| 512 |
+
- 线条精确性 (30%)
|
| 513 |
+
- 风格一致性 (25%)
|
| 514 |
+
- 文化准确性 (15%)
|
| 515 |
+
|
| 516 |
+
#### 用户调查
|
| 517 |
+
- 100+用户参与测试
|
| 518 |
+
- 评估维度:
|
| 519 |
+
- 修复效果满意度
|
| 520 |
+
- 操作便捷性
|
| 521 |
+
- 结果实用性
|
| 522 |
+
|
| 523 |
+
### 3. 文化准确性评估
|
| 524 |
+
|
| 525 |
+
由藏传佛教艺术专家评估:
|
| 526 |
+
- 宗教元素正确性
|
| 527 |
+
- 传统技法保留度
|
| 528 |
+
- 文化内涵表达
|
| 529 |
+
|
| 530 |
+
---
|
| 531 |
+
|
| 532 |
+
## 📚 引用信息
|
| 533 |
+
|
| 534 |
+
如果您在研究或项目中使用了这些模型,请引用:
|
| 535 |
+
|
| 536 |
+
```bibtex
|
| 537 |
+
@misc{thangka-restoration-ai-2024,
|
| 538 |
+
title={AI-powered Thangka Image Restoration System},
|
| 539 |
+
author={Wangchuk Mind},
|
| 540 |
+
institution={College of Computer Science, Sichuan University},
|
| 541 |
+
year={2024},
|
| 542 |
+
publisher={Hugging Face},
|
| 543 |
+
howpublished={\url{https://huggingface.co/Wangchuk1376/ThangkaModels}},
|
| 544 |
+
note={Trained on 1376 professionally annotated Thangka images}
|
| 545 |
+
}
|
| 546 |
+
```
|
| 547 |
+
|
| 548 |
+
相关论文:
|
| 549 |
+
```bibtex
|
| 550 |
+
@article{thangka-lora-2024,
|
| 551 |
+
title={LoRA-based Fine-tuning for Thangka Art Restoration},
|
| 552 |
+
author={Wangchuk Mind and others},
|
| 553 |
+
journal={Cultural Heritage Digital Preservation},
|
| 554 |
+
year={2024},
|
| 555 |
+
note={Under review}
|
| 556 |
+
}
|
| 557 |
+
```
|
| 558 |
+
|
| 559 |
+
---
|
| 560 |
+
|
| 561 |
+
## 📄 许可证
|
| 562 |
+
|
| 563 |
+
本项目采用 **MIT License** 开源协议。
|
| 564 |
+
|
| 565 |
+
### 许可说明
|
| 566 |
+
|
| 567 |
+
✅ **允许**:
|
| 568 |
+
- 商业使用
|
| 569 |
+
- 修改和再分发
|
| 570 |
+
- 私人使用
|
| 571 |
+
- 专利使用
|
| 572 |
+
|
| 573 |
+
⚠️ **条件**:
|
| 574 |
+
- 必须包含版权声明
|
| 575 |
+
- 必须包含许可证副本
|
| 576 |
+
|
| 577 |
+
❌ **限制**:
|
| 578 |
+
- 不提供责任保证
|
| 579 |
+
- 不提供担保
|
| 580 |
+
|
| 581 |
+
详细条款请查看 [LICENSE](LICENSE) 文件。
|
| 582 |
+
|
| 583 |
+
---
|
| 584 |
+
|
| 585 |
+
## 🤝 贡献指南
|
| 586 |
+
|
| 587 |
+
欢迎对项目做出贡献!
|
| 588 |
+
|
| 589 |
+
### 如何贡献
|
| 590 |
+
|
| 591 |
+
1. **Fork本仓库**
|
| 592 |
+
2. **创建特性分支** (`git checkout -b feature/AmazingFeature`)
|
| 593 |
+
3. **提交更改** (`git commit -m 'Add some AmazingFeature'`)
|
| 594 |
+
4. **推送到分支** (`git push origin feature/AmazingFeature`)
|
| 595 |
+
5. **开启Pull Request**
|
| 596 |
+
|
| 597 |
+
### 贡献方向
|
| 598 |
+
|
| 599 |
+
- 🔬 改进训练方法和数据集
|
| 600 |
+
- 🎨 添加新的LoRA模型 (不同风格)
|
| 601 |
+
- 📝 完善文档和教程
|
| 602 |
+
- 🐛 修复bug和问题
|
| 603 |
+
- 🌐 多语言支持 (藏语、英语等)
|
| 604 |
+
- 📱 移动端适配
|
| 605 |
+
|
| 606 |
+
---
|
| 607 |
+
|
| 608 |
+
## 🙏 致谢
|
| 609 |
+
|
| 610 |
+
### 技术支持
|
| 611 |
+
- **PaddlePaddle团队**: 深度学习框架支持
|
| 612 |
+
- **Hugging Face**: 模型托管平台
|
| 613 |
+
- **Stability AI**: Stable Diffusion基础模型
|
| 614 |
+
|
| 615 |
+
### 学术支持
|
| 616 |
+
- **四川大学计算机学院**: 研究环境和资源
|
| 617 |
+
- **人工智能实验室**: 计算资源支持
|
| 618 |
+
|
| 619 |
+
### 文化指导
|
| 620 |
+
- **唐卡艺术专家**: 文化准确性指导
|
| 621 |
+
- **藏传佛教学者**: 宗教内涵审核
|
| 622 |
+
- **文物保护专家**: 修复方法建议
|
| 623 |
+
|
| 624 |
+
### 数据支持
|
| 625 |
+
- **博物馆合作方**: 图像数据授权
|
| 626 |
+
- **寺庙支持方**: 实地采集许可
|
| 627 |
+
- **个人收藏家**: 高质量图像提供
|
| 628 |
+
|
| 629 |
+
---
|
| 630 |
+
|
| 631 |
+
## 📞 联系方式
|
| 632 |
+
|
| 633 |
+
### 开发者
|
| 634 |
+
- **姓名**: Wangchuk Mind
|
| 635 |
+
- **GitHub**: [@WangchukMind](https://github.com/WangchukMind)
|
| 636 |
+
- **Hugging Face**: [@Wangchuk1376](https://huggingface.co/Wangchuk1376)
|
| 637 |
+
|
| 638 |
+
### 项目链接
|
| 639 |
+
- **完整系统**: [GitHub Repository](https://github.com/WangchukMind/thangka-restoration-ai)
|
| 640 |
+
- **模型仓库**: [Hugging Face Models](https://huggingface.co/Wangchuk1376/ThangkaModels)
|
| 641 |
+
- **在线演示**: [Demo Site](#) (即将推出)
|
| 642 |
+
- **技术文档**: [Documentation](https://github.com/WangchukMind/thangka-restoration-ai/wiki)
|
| 643 |
+
|
| 644 |
+
### 问题反馈
|
| 645 |
+
- **Bug报告**: [GitHub Issues](https://github.com/WangchukMind/thangka-restoration-ai/issues)
|
| 646 |
+
- **功能建议**: [GitHub Discussions](https://github.com/WangchukMind/thangka-restoration-ai/discussions)
|
| 647 |
+
- **技术交流**: [Discussions](https://huggingface.co/Wangchuk1376/ThangkaModels/discussions)
|
| 648 |
+
|
| 649 |
+
---
|
| 650 |
+
|
| 651 |
+
## 🌟 Star History
|
| 652 |
+
|
| 653 |
+
如果这个项目对您有帮助,请给我们一个⭐️!
|
| 654 |
+
|
| 655 |
+
[](https://star-history.com/#WangchukMind/thangka-restoration-ai&Date)
|
| 656 |
+
|
| 657 |
+
---
|
| 658 |
+
|
| 659 |
+
## 🗺️ 未来规划
|
| 660 |
+
|
| 661 |
+
### 短期计划 (3个月)
|
| 662 |
+
- [ ] 添加更多LoRA模型 (尼泊尔风格、蒙古风格)
|
| 663 |
+
- [ ] 优化推理速度 (目标: 50%提升)
|
| 664 |
+
- [ ] 移动端APP开发
|
| 665 |
+
- [ ] 多语言界面支持 (藏语、英语)
|
| 666 |
+
|
| 667 |
+
### 中期计划 (6个月)
|
| 668 |
+
- [ ] 支持更高分辨率 (1024×1024, 2048×2048)
|
| 669 |
+
- [ ] 3D唐卡重建技术
|
| 670 |
+
- [ ] 批量处理优化
|
| 671 |
+
- [ ] API服务开放
|
| 672 |
+
|
| 673 |
+
### 长期计划 (12个月)
|
| 674 |
+
- [ ] VR/AR虚拟修复体验
|
| 675 |
+
- [ ] 全自动损伤检测和修复
|
| 676 |
+
- [ ] 国际博物馆合作
|
| 677 |
+
- [ ] 学术论文发表
|
| 678 |
+
|
| 679 |
+
---
|
| 680 |
+
|
| 681 |
+
<div align="center">
|
| 682 |
+
|
| 683 |
+
## 🎨 让AI技术守护千年唐卡文化!
|
| 684 |
+
|
| 685 |
+
**AI + Intangible Cultural Heritage Preservation**
|
| 686 |
+
|
| 687 |
+
*Made with ❤️ by Wangchuk Mind*
|
| 688 |
+
|
| 689 |
+
</div>
|
| 690 |
+
|
| 691 |
+
---
|
| 692 |
+
|
| 693 |
+
## <a name="english-version"></a>📖 English Version
|
| 694 |
+
|
| 695 |
+
### Project Overview
|
| 696 |
+
|
| 697 |
+
This is a collection of AI models specifically designed for the restoration of Tibetan Buddhist Thangka art, based on **Stable Diffusion 2.1** and **LoRA fine-tuning technology**, trained on **1376** professionally annotated Thangka images.
|
| 698 |
+
|
| 699 |
+
### Key Features
|
| 700 |
+
|
| 701 |
+
- ✅ **Professional Training**: Dataset of 1376 high-quality Thangka images
|
| 702 |
+
- ✅ **Cultural Accuracy**: Maintains traditional Thangka art characteristics, >95% cultural accuracy
|
| 703 |
+
- ✅ **Efficient Restoration**: LoRA-based technology for quick adaptation to different styles
|
| 704 |
+
- ✅ **Multiple Models**: Various LoRA models for different restoration needs
|
| 705 |
+
- ✅ **PaddlePaddle**: Fully compatible with PaddlePaddle deep learning framework
|
| 706 |
+
|
| 707 |
+
### Quick Start
|
| 708 |
+
|
| 709 |
+
```python
|
| 710 |
+
# Load model and perform restoration
|
| 711 |
+
from diffusion_paddle import load_model, load_lora, inpaint
|
| 712 |
+
from PIL import Image
|
| 713 |
+
|
| 714 |
+
# Load base model
|
| 715 |
+
pipe = load_model("models/sd2.1_base_paddle", device="gpu")
|
| 716 |
+
|
| 717 |
+
# Load LoRA model
|
| 718 |
+
load_lora(pipe, "models/finetuned/thangka_21_Status_140.safetensors")
|
| 719 |
+
|
| 720 |
+
# Load damaged image and mask
|
| 721 |
+
image = Image.open("damaged_thangka.png").resize((512, 512))
|
| 722 |
+
mask = Image.open("damage_mask.png").resize((512, 512))
|
| 723 |
+
|
| 724 |
+
# Perform restoration
|
| 725 |
+
result = inpaint(
|
| 726 |
+
pipe=pipe,
|
| 727 |
+
image=image,
|
| 728 |
+
mask=mask,
|
| 729 |
+
prompt="traditional thangka art, Buddha, detailed, vibrant colors",
|
| 730 |
+
negative_prompt="low quality, blurry, distorted",
|
| 731 |
+
num_inference_steps=30
|
| 732 |
+
)
|
| 733 |
+
|
| 734 |
+
result.save("restored_thangka.png")
|
| 735 |
+
```
|
| 736 |
+
|
| 737 |
+
For complete system: [GitHub Repository](https://github.com/WangchukMind/thangka-restoration-ai)
|
| 738 |
+
|
| 739 |
+
### License
|
| 740 |
+
|
| 741 |
+
MIT License
|
| 742 |
+
|
| 743 |
+
### Contact
|
| 744 |
+
|
| 745 |
+
- **Developer**: Wangchuk Mind
|
| 746 |
+
- **GitHub**: [@WangchukMind](https://github.com/WangchukMind)
|
| 747 |
+
- **Hugging Face**: [@Wangchuk1376](https://huggingface.co/Wangchuk1376)
|
| 748 |
+
|
| 749 |
+
---
|
| 750 |
+
|
| 751 |
+
**🎨 Preserving millennium-old Thangka culture with AI technology!**
|
fix_upload_issues.md
ADDED
|
@@ -0,0 +1,273 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 🔧 Hugging Face上传问题解决方案
|
| 2 |
+
|
| 3 |
+
## ❌ 遇到的错误
|
| 4 |
+
|
| 5 |
+
```
|
| 6 |
+
403 Forbidden: Forbidden: pass `create_pr=1` as a query parameter to create a Pull Request.
|
| 7 |
+
Cannot access content at: https://huggingface.co/api/models/Wangchuk1376/ThangkaModels
|
| 8 |
+
Make sure your token has the correct permissions.
|
| 9 |
+
```
|
| 10 |
+
|
| 11 |
+
## 🎯 问题原因
|
| 12 |
+
|
| 13 |
+
1. **仓库不存在** - 需要先在Hugging Face创建仓库
|
| 14 |
+
2. **Token权限不足** - 需要确保Token有写入权限
|
| 15 |
+
|
| 16 |
+
## ✅ 解决步骤
|
| 17 |
+
|
| 18 |
+
### 步骤1: 在Hugging Face创建仓库
|
| 19 |
+
|
| 20 |
+
1. 访问 https://huggingface.co/new
|
| 21 |
+
2. 填写信息:
|
| 22 |
+
- **Owner**: `Wangchuk1376`
|
| 23 |
+
- **Model name**: `ThangkaModels`
|
| 24 |
+
- **License**: `mit`
|
| 25 |
+
- **Visibility**: `Public` (推荐)
|
| 26 |
+
3. 点击 **Create model**
|
| 27 |
+
|
| 28 |
+
### 步骤2: 检查Token权限
|
| 29 |
+
|
| 30 |
+
1. 访问 https://huggingface.co/settings/tokens
|
| 31 |
+
2. 找到您正在使用的Token
|
| 32 |
+
3. 确保权限是 **Write** (不是Read)
|
| 33 |
+
4. 如果不是,创建一个新的Write权限Token:
|
| 34 |
+
- 点击 **New token**
|
| 35 |
+
- Name: `thangka-upload`
|
| 36 |
+
- Role: **Write**
|
| 37 |
+
- 点击 **Generate a token**
|
| 38 |
+
- **复制Token并保存**
|
| 39 |
+
|
| 40 |
+
### 步骤3: 重新登录
|
| 41 |
+
|
| 42 |
+
```bash
|
| 43 |
+
# 登出
|
| 44 |
+
hf auth logout
|
| 45 |
+
|
| 46 |
+
# 重新登录,使用新的Write权限Token
|
| 47 |
+
hf auth login
|
| 48 |
+
# 粘贴您的新Token
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
### 步骤4: 使用正确的上传命令
|
| 52 |
+
|
| 53 |
+
```bash
|
| 54 |
+
cd "/Users/xiang/SCU/Xiang/Thangka/Paddle 3/Thangka/thangka1376"
|
| 55 |
+
|
| 56 |
+
# 方式1: 上传整个目录 (推荐先创建仓库后使用)
|
| 57 |
+
hf upload Wangchuk1376/ThangkaModels . --repo-type model
|
| 58 |
+
|
| 59 |
+
# 方式2: 先上传小文件测试
|
| 60 |
+
hf upload Wangchuk1376/ThangkaModels README.md --repo-type model
|
| 61 |
+
|
| 62 |
+
# 方式3: 分批上传models目录
|
| 63 |
+
hf upload Wangchuk1376/ThangkaModels models/ --repo-type model
|
| 64 |
+
```
|
| 65 |
+
|
| 66 |
+
**注意**: `--num-workers`参数CLI不支持,这是Python API的参数。
|
| 67 |
+
|
| 68 |
+
### 步骤5: 如果文件夹太大,使用大文件夹上传
|
| 69 |
+
|
| 70 |
+
```bash
|
| 71 |
+
hf upload-large-folder Wangchuk1376/ThangkaModels . --repo-type model
|
| 72 |
+
```
|
| 73 |
+
|
| 74 |
+
## 🐍 使用Python脚本上传 (推荐)
|
| 75 |
+
|
| 76 |
+
创建 `upload_models.py`:
|
| 77 |
+
|
| 78 |
+
```python
|
| 79 |
+
from huggingface_hub import HfApi, create_repo
|
| 80 |
+
import os
|
| 81 |
+
|
| 82 |
+
# 初始化API
|
| 83 |
+
api = HfApi()
|
| 84 |
+
|
| 85 |
+
# 仓库信息
|
| 86 |
+
repo_id = "Wangchuk1376/ThangkaModels"
|
| 87 |
+
local_dir = "/Users/xiang/SCU/Xiang/Thangka/Paddle 3/Thangka/thangka1376"
|
| 88 |
+
|
| 89 |
+
print("🚀 开始上传唐卡模型...")
|
| 90 |
+
|
| 91 |
+
# 步骤1: 创建仓库 (如果不存在)
|
| 92 |
+
try:
|
| 93 |
+
create_repo(
|
| 94 |
+
repo_id=repo_id,
|
| 95 |
+
repo_type="model",
|
| 96 |
+
exist_ok=True,
|
| 97 |
+
private=False
|
| 98 |
+
)
|
| 99 |
+
print(f"✅ 仓库 {repo_id} 已创建/验证")
|
| 100 |
+
except Exception as e:
|
| 101 |
+
print(f"⚠️ 创建仓库: {e}")
|
| 102 |
+
|
| 103 |
+
# 步骤2: 上传README
|
| 104 |
+
print("\n📝 上传README...")
|
| 105 |
+
try:
|
| 106 |
+
api.upload_file(
|
| 107 |
+
path_or_fileobj=os.path.join(local_dir, "README.md"),
|
| 108 |
+
path_in_repo="README.md",
|
| 109 |
+
repo_id=repo_id,
|
| 110 |
+
repo_type="model"
|
| 111 |
+
)
|
| 112 |
+
print("✅ README.md 上传成功")
|
| 113 |
+
except Exception as e:
|
| 114 |
+
print(f"❌ README上传失败: {e}")
|
| 115 |
+
|
| 116 |
+
# 步骤3: 上传.gitattributes
|
| 117 |
+
print("\n📝 上传.gitattributes...")
|
| 118 |
+
gitattributes_content = """*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 119 |
+
*.pdparams filter=lfs diff=lfs merge=lfs -text
|
| 120 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 121 |
+
"""
|
| 122 |
+
|
| 123 |
+
try:
|
| 124 |
+
api.upload_file(
|
| 125 |
+
path_or_fileobj=gitattributes_content.encode(),
|
| 126 |
+
path_in_repo=".gitattributes",
|
| 127 |
+
repo_id=repo_id,
|
| 128 |
+
repo_type="model"
|
| 129 |
+
)
|
| 130 |
+
print("✅ .gitattributes 上传成功")
|
| 131 |
+
except Exception as e:
|
| 132 |
+
print(f"❌ .gitattributes上传失败: {e}")
|
| 133 |
+
|
| 134 |
+
# 步骤4: 上传整个文件夹
|
| 135 |
+
print("\n📤 上传models目录 (这可能需要较长时间)...")
|
| 136 |
+
try:
|
| 137 |
+
api.upload_folder(
|
| 138 |
+
folder_path=local_dir,
|
| 139 |
+
repo_id=repo_id,
|
| 140 |
+
repo_type="model",
|
| 141 |
+
ignore_patterns=[
|
| 142 |
+
".DS_Store",
|
| 143 |
+
"*.pyc",
|
| 144 |
+
"__pycache__",
|
| 145 |
+
"*.sh",
|
| 146 |
+
"*.py",
|
| 147 |
+
"fix_upload_issues.md"
|
| 148 |
+
],
|
| 149 |
+
multi_commits=True, # 大文件夹分批上传
|
| 150 |
+
multi_commits_verbose=True
|
| 151 |
+
)
|
| 152 |
+
print("✅ 所有文件上传成功!")
|
| 153 |
+
except Exception as e:
|
| 154 |
+
print(f"❌ 上传失败: {e}")
|
| 155 |
+
print("\n💡 提示: 如果是大文件问题,可以尝试:")
|
| 156 |
+
print(" 1. 分批上传小文件")
|
| 157 |
+
print(" 2. 使用Git LFS")
|
| 158 |
+
print(" 3. 联系Hugging Face支持")
|
| 159 |
+
|
| 160 |
+
print("\n🎉 完成!")
|
| 161 |
+
print(f"🌐 访问您的模型: https://huggingface.co/{repo_id}")
|
| 162 |
+
```
|
| 163 |
+
|
| 164 |
+
运行脚本:
|
| 165 |
+
|
| 166 |
+
```bash
|
| 167 |
+
cd "/Users/xiang/SCU/Xiang/Thangka/Paddle 3/Thangka/thangka1376"
|
| 168 |
+
python upload_models.py
|
| 169 |
+
```
|
| 170 |
+
|
| 171 |
+
## 📋 完整上传流程
|
| 172 |
+
|
| 173 |
+
### 选项A: 通过Web界面创建 + CLI上传
|
| 174 |
+
|
| 175 |
+
```bash
|
| 176 |
+
# 1. 先在Web创建仓库
|
| 177 |
+
# https://huggingface.co/new
|
| 178 |
+
|
| 179 |
+
# 2. 重新登录CLI
|
| 180 |
+
hf auth logout
|
| 181 |
+
hf auth login
|
| 182 |
+
|
| 183 |
+
# 3. 测试上传README
|
| 184 |
+
hf upload Wangchuk1376/ThangkaModels README.md --repo-type model
|
| 185 |
+
|
| 186 |
+
# 4. 如果成功,上传全部
|
| 187 |
+
hf upload Wangchuk1376/ThangkaModels . --repo-type model
|
| 188 |
+
```
|
| 189 |
+
|
| 190 |
+
### 选项B: 使用Python脚本 (最稳定)
|
| 191 |
+
|
| 192 |
+
```bash
|
| 193 |
+
# 运行上面的Python脚本
|
| 194 |
+
python upload_models.py
|
| 195 |
+
```
|
| 196 |
+
|
| 197 |
+
### 选项C: 使用Git LFS (适合超大文件)
|
| 198 |
+
|
| 199 |
+
```bash
|
| 200 |
+
# 1. 安装Git LFS
|
| 201 |
+
brew install git-lfs
|
| 202 |
+
git lfs install
|
| 203 |
+
|
| 204 |
+
# 2. 克隆仓库 (先在Web创建)
|
| 205 |
+
git clone https://huggingface.co/Wangchuk1376/ThangkaModels
|
| 206 |
+
cd ThangkaModels
|
| 207 |
+
|
| 208 |
+
# 3. 配置LFS
|
| 209 |
+
git lfs track "*.safetensors"
|
| 210 |
+
git lfs track "*.pdparams"
|
| 211 |
+
git add .gitattributes
|
| 212 |
+
|
| 213 |
+
# 4. 复制文件
|
| 214 |
+
cp -r "/Users/xiang/SCU/Xiang/Thangka/Paddle 3/Thangka/thangka1376/"* .
|
| 215 |
+
|
| 216 |
+
# 5. 提交并推送
|
| 217 |
+
git add .
|
| 218 |
+
git commit -m "Initial upload of Thangka models"
|
| 219 |
+
git push
|
| 220 |
+
```
|
| 221 |
+
|
| 222 |
+
## 🔍 故障排查
|
| 223 |
+
|
| 224 |
+
### 错误1: 403 Forbidden
|
| 225 |
+
|
| 226 |
+
**原因**: 仓库不存在或Token权限不足
|
| 227 |
+
|
| 228 |
+
**解决**:
|
| 229 |
+
1. 先在 https://huggingface.co/new 创建仓库
|
| 230 |
+
2. 确保Token有Write权限
|
| 231 |
+
3. 重新登录: `hf auth login`
|
| 232 |
+
|
| 233 |
+
### 错误2: 文件太大
|
| 234 |
+
|
| 235 |
+
**原因**: 单次上传文件太多
|
| 236 |
+
|
| 237 |
+
**解决**:
|
| 238 |
+
```bash
|
| 239 |
+
# 使用大文件夹上传命令
|
| 240 |
+
hf upload-large-folder Wangchuk1376/ThangkaModels . --repo-type model
|
| 241 |
+
|
| 242 |
+
# 或分批上传
|
| 243 |
+
hf upload Wangchuk1376/ThangkaModels README.md --repo-type model
|
| 244 |
+
hf upload Wangchuk1376/ThangkaModels models/finetuned/ --repo-type model
|
| 245 |
+
hf upload Wangchuk1376/ThangkaModels models/sd2.1_base_paddle/ --repo-type model
|
| 246 |
+
```
|
| 247 |
+
|
| 248 |
+
### 错误3: Network timeout
|
| 249 |
+
|
| 250 |
+
**原因**: 网络不稳定
|
| 251 |
+
|
| 252 |
+
**解决**:
|
| 253 |
+
1. 使用VPN或更稳定的网络
|
| 254 |
+
2. 分批上传小文件
|
| 255 |
+
3. 使用Python脚本的`multi_commits=True`选项
|
| 256 |
+
|
| 257 |
+
## 💡 最佳实践
|
| 258 |
+
|
| 259 |
+
1. **先创建仓库**: 在Web界面手动创建
|
| 260 |
+
2. **测试小文件**: 先上传README测试
|
| 261 |
+
3. **分批上传**: 大文件分多次上传
|
| 262 |
+
4. **使用Python**: Python API更稳定可靠
|
| 263 |
+
5. **耐心等待**: 大文件上传需要时间
|
| 264 |
+
|
| 265 |
+
## 📞 需要帮助?
|
| 266 |
+
|
| 267 |
+
如果问题仍未解决:
|
| 268 |
+
|
| 269 |
+
1. 检查网络连接
|
| 270 |
+
2. 检查Token是否有效
|
| 271 |
+
3. 查看Hugging Face状态页: https://status.huggingface.co/
|
| 272 |
+
4. 联系Hugging Face支持: https://huggingface.co/support
|
| 273 |
+
|
models/.DS_Store
ADDED
|
Binary file (10.2 kB). View file
|
|
|
models/control_v11p_sd21_canny/config.json
ADDED
|
File without changes
|
models/control_v11p_sd21_canny/config_paddle.json
ADDED
|
File without changes
|
models/control_v11p_sd21_canny/diffusion_pytorch_model.safetensors
ADDED
|
File without changes
|
models/control_v11p_sd21_canny/gitattributes
ADDED
|
File without changes
|
models/control_v11p_sd21_canny_paddle/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ControlNet Canny Paddle Models
|
| 2 |
+
|
| 3 |
+
This directory contains ControlNet Canny PaddlePaddle models.
|
models/control_v11p_sd21_canny_paddle/config.json
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "ControlNetModel",
|
| 3 |
+
"_diffusers_version": "0.34.0",
|
| 4 |
+
"_name_or_path": "models/control_v11p_sd21_canny",
|
| 5 |
+
"act_fn": "silu",
|
| 6 |
+
"addition_embed_type": null,
|
| 7 |
+
"addition_embed_type_num_heads": 64,
|
| 8 |
+
"addition_time_embed_dim": null,
|
| 9 |
+
"attention_head_dim": 8,
|
| 10 |
+
"block_out_channels": [
|
| 11 |
+
320,
|
| 12 |
+
640,
|
| 13 |
+
1280,
|
| 14 |
+
1280
|
| 15 |
+
],
|
| 16 |
+
"class_embed_type": null,
|
| 17 |
+
"conditioning_channels": 3,
|
| 18 |
+
"conditioning_embedding_out_channels": [
|
| 19 |
+
16,
|
| 20 |
+
32,
|
| 21 |
+
96,
|
| 22 |
+
256
|
| 23 |
+
],
|
| 24 |
+
"controlnet_conditioning_channel_order": "rgb",
|
| 25 |
+
"cross_attention_dim": 1024,
|
| 26 |
+
"down_block_types": [
|
| 27 |
+
"CrossAttnDownBlock2D",
|
| 28 |
+
"CrossAttnDownBlock2D",
|
| 29 |
+
"CrossAttnDownBlock2D",
|
| 30 |
+
"DownBlock2D"
|
| 31 |
+
],
|
| 32 |
+
"downsample_padding": 1,
|
| 33 |
+
"encoder_hid_dim": null,
|
| 34 |
+
"encoder_hid_dim_type": null,
|
| 35 |
+
"flip_sin_to_cos": true,
|
| 36 |
+
"freq_shift": 0,
|
| 37 |
+
"global_pool_conditions": false,
|
| 38 |
+
"in_channels": 4,
|
| 39 |
+
"layers_per_block": 2,
|
| 40 |
+
"mid_block_scale_factor": 1,
|
| 41 |
+
"mid_block_type": "UNetMidBlock2DCrossAttn",
|
| 42 |
+
"norm_eps": 1e-05,
|
| 43 |
+
"norm_num_groups": 32,
|
| 44 |
+
"num_attention_heads": null,
|
| 45 |
+
"num_class_embeds": null,
|
| 46 |
+
"only_cross_attention": false,
|
| 47 |
+
"projection_class_embeddings_input_dim": null,
|
| 48 |
+
"resnet_time_scale_shift": "default",
|
| 49 |
+
"transformer_layers_per_block": 1,
|
| 50 |
+
"upcast_attention": false,
|
| 51 |
+
"use_linear_projection": true
|
| 52 |
+
}
|
models/control_v11p_sd21_canny_paddle/conversion_guide.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ControlNet模型转换说明
|
| 2 |
+
|
| 3 |
+
## 转换状态
|
| 4 |
+
- 源文件: diffusion_pytorch_model.safetensors (PyTorch格式)
|
| 5 |
+
- 目标文件: model.pdparams (PaddlePaddle格式)
|
| 6 |
+
- 文件大小: 1.4GB
|
| 7 |
+
- 状态: 待转换
|
| 8 |
+
|
| 9 |
+
## 转换方法
|
| 10 |
+
|
| 11 |
+
### 使用diffusers库
|
| 12 |
+
```python
|
| 13 |
+
from diffusers import ControlNetModel
|
| 14 |
+
import torch
|
| 15 |
+
|
| 16 |
+
# 加载PyTorch模型
|
| 17 |
+
controlnet = ControlNetModel.from_pretrained(
|
| 18 |
+
"models/control_v11p_sd21_canny",
|
| 19 |
+
torch_dtype=torch.float32,
|
| 20 |
+
use_safetensors=True
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# 转换为PaddlePaddle格式
|
| 24 |
+
controlnet.save_pretrained("models/control_v11p_sd21_canny_paddle")
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
### 使用PaddlePaddle官方工具
|
| 28 |
+
```bash
|
| 29 |
+
# 安装转换工具
|
| 30 |
+
pip install paddlepaddle
|
| 31 |
+
pip install paddlenlp
|
| 32 |
+
|
| 33 |
+
# 使用转换脚本
|
| 34 |
+
python -m paddlenlp.transformers.convert_pytorch_checkpoint_to_paddle \
|
| 35 |
+
--model_name_or_path models/control_v11p_sd21_canny \
|
| 36 |
+
--output_dir models/control_v11p_sd21_canny_paddle
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## 注意事项
|
| 40 |
+
- 确保有足够的磁盘空间
|
| 41 |
+
- 转换过程可能需要较长时间
|
| 42 |
+
- 建议在转换前备份原始文件
|
| 43 |
+
- 验证转换后的模型功能
|
models/control_v11p_sd21_canny_paddle/conversion_status.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
# ControlNet转换状态
|
models/control_v11p_sd21_canny_paddle/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:dc44368fbe281580fed7cd58c026974ac59f669d21e41920023ece27ae600fb6
|
| 3 |
+
size 1456953560
|
models/control_v11p_sd21_canny_paddle/gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/control_v11p_sd21_canny_paddle/model_info.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"model_type": "ControlNet"}
|
models/control_v11p_sd21_canny_paddle/weight_conversion_status.json
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model": "ControlNet模型",
|
| 3 |
+
"status": "ready_for_conversion",
|
| 4 |
+
"source_file": "diffusion_pytorch_model.safetensors",
|
| 5 |
+
"target_file": "model.pdparams",
|
| 6 |
+
"file_size": "1.4GB",
|
| 7 |
+
"conversion_methods": [
|
| 8 |
+
"diffusers库转换",
|
| 9 |
+
"PaddlePaddle官方工具",
|
| 10 |
+
"手动转换"
|
| 11 |
+
]
|
| 12 |
+
}
|
models/finetuned/thangka_21_ACD_250.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:90e564949dafe90b996709649bc98af0a2aaf2dec0961aa969fdf48dc5eecb73
|
| 3 |
+
size 3358544
|
models/finetuned/thangka_21_Status_140.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b2874853dca6f06d0e1bf3292987eed40e93b0f84e959e382ae2a343fb32cfe3
|
| 3 |
+
size 3358544
|
models/finetuned_paddle/.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
models/finetuned_paddle/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Finetuned Paddle Models
|
| 2 |
+
|
| 3 |
+
This directory contains finetuned PaddlePaddle models.
|
models/finetuned_paddle/conversion_guide.md
ADDED
|
@@ -0,0 +1,32 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# 微调模型转换说明
|
| 2 |
+
|
| 3 |
+
## 转换状态
|
| 4 |
+
- 源文件: *.safetensors (PyTorch格式)
|
| 5 |
+
- 目标文件: *.pdparams (PaddlePaddle格式)
|
| 6 |
+
- 文件大小: 3.2MB
|
| 7 |
+
- 状态: 部分已转换
|
| 8 |
+
|
| 9 |
+
## 转换方法
|
| 10 |
+
|
| 11 |
+
### 使用diffusers库
|
| 12 |
+
```python
|
| 13 |
+
from diffusers import StableDiffusionPipeline
|
| 14 |
+
import paddle
|
| 15 |
+
|
| 16 |
+
# 加载基础模型
|
| 17 |
+
pipe = StableDiffusionPipeline.from_pretrained(
|
| 18 |
+
"models/sd2.1_base_paddle",
|
| 19 |
+
paddle_dtype=paddle.float32
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
# 加载LoRA权重
|
| 23 |
+
pipe.load_lora_weights("models/finetuned/thangka_21_ACD_250.safetensors")
|
| 24 |
+
|
| 25 |
+
# 保存为PaddlePaddle格式
|
| 26 |
+
pipe.save_pretrained("models/finetuned_paddle/thangka_21_ACD_250")
|
| 27 |
+
```
|
| 28 |
+
|
| 29 |
+
## 注意事项
|
| 30 |
+
- 确保基础模型已转换
|
| 31 |
+
- 验证LoRA权重加载正确
|
| 32 |
+
- 测试转换后的模型功能
|
models/finetuned_paddle/model_info.json
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{"model_type": "LoRA Fine-tuned"}
|
models/finetuned_paddle/thangka_21_ACD_250/model.pdparams
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8163bb8783a1445ab44913cd45721a021e4f30fdf1c265dc3a3b808986921614
|
| 3 |
+
size 3351717
|
models/finetuned_paddle/thangka_21_ACD_250/model_info.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"original_file": "core/models/finetuned/thangka_21_ACD_250.safetensors",
|
| 3 |
+
"converted_file": "core/models/finetuned_paddle/thangka_21_ACD_250/model.pdparams",
|
| 4 |
+
"parameters_count": 256,
|
| 5 |
+
"framework": "PaddlePaddle",
|
| 6 |
+
"conversion_tool": "convert_models_to_paddle.py"
|
| 7 |
+
}
|
models/finetuned_paddle/thangka_21_ACD_250_paddle.pdparams
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8163bb8783a1445ab44913cd45721a021e4f30fdf1c265dc3a3b808986921614
|
| 3 |
+
size 3351717
|
models/finetuned_paddle/thangka_21_Status_140/model.pdparams
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21276eae626bbc852bf40038c412f52e6e3842bc00ae6d59a07a9f4bca02748c
|
| 3 |
+
size 3351717
|
models/finetuned_paddle/thangka_21_Status_140/model_info.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"original_file": "core/models/finetuned/thangka_21_Status_140.safetensors",
|
| 3 |
+
"converted_file": "core/models/finetuned_paddle/thangka_21_Status_140/model.pdparams",
|
| 4 |
+
"parameters_count": 256,
|
| 5 |
+
"framework": "PaddlePaddle",
|
| 6 |
+
"conversion_tool": "convert_models_to_paddle.py"
|
| 7 |
+
}
|
models/finetuned_paddle/thangka_21_Status_140_paddle.pdparams
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:21276eae626bbc852bf40038c412f52e6e3842bc00ae6d59a07a9f4bca02748c
|
| 3 |
+
size 3351717
|
models/sd2.1_base/.DS_Store
ADDED
|
Binary file (8.2 kB). View file
|
|
|
models/sd2.1_base/README.md
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: openrail++
|
| 3 |
+
tags:
|
| 4 |
+
- stable-diffusion
|
| 5 |
+
- text-to-image
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Stable Diffusion v2-1-base Model Card
|
| 9 |
+
This model card focuses on the model associated with the Stable Diffusion v2-1-base model.
|
| 10 |
+
|
| 11 |
+
This `stable-diffusion-2-1-base` model fine-tunes [stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) (`512-base-ema.ckpt`) with 220k extra steps taken, with `punsafe=0.98` on the same dataset.
|
| 12 |
+
|
| 13 |
+
- Use it with the [`stablediffusion`](https://github.com/Stability-AI/stablediffusion) repository: download the `v2-1_512-ema-pruned.ckpt` [here](https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.ckpt).
|
| 14 |
+
- Use it with 🧨 [`diffusers`](#examples)
|
| 15 |
+
|
| 16 |
+
## Model Details
|
| 17 |
+
- **Developed by:** Robin Rombach, Patrick Esser
|
| 18 |
+
- **Model type:** Diffusion-based text-to-image generation model
|
| 19 |
+
- **Language(s):** English
|
| 20 |
+
- **License:** [CreativeML Open RAIL++-M License](https://huggingface.co/stabilityai/stable-diffusion-2/blob/main/LICENSE-MODEL)
|
| 21 |
+
- **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([OpenCLIP-ViT/H](https://github.com/mlfoundations/open_clip)).
|
| 22 |
+
- **Resources for more information:** [GitHub Repository](https://github.com/Stability-AI/).
|
| 23 |
+
- **Cite as:**
|
| 24 |
+
|
| 25 |
+
@InProceedings{Rombach_2022_CVPR,
|
| 26 |
+
author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
|
| 27 |
+
title = {High-Resolution Image Synthesis With Latent Diffusion Models},
|
| 28 |
+
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
| 29 |
+
month = {June},
|
| 30 |
+
year = {2022},
|
| 31 |
+
pages = {10684-10695}
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
## Examples
|
| 36 |
+
|
| 37 |
+
Using the [🤗's Diffusers library](https://github.com/huggingface/diffusers) to run Stable Diffusion 2 in a simple and efficient manner.
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
pip install diffusers transformers accelerate scipy safetensors
|
| 41 |
+
```
|
| 42 |
+
Running the pipeline (if you don't swap the scheduler it will run with the default PNDM/PLMS scheduler, in this example we are swapping it to EulerDiscreteScheduler):
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
|
| 46 |
+
import torch
|
| 47 |
+
|
| 48 |
+
model_id = "stabilityai/stable-diffusion-2-1-base"
|
| 49 |
+
|
| 50 |
+
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
|
| 51 |
+
pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)
|
| 52 |
+
pipe = pipe.to("cuda")
|
| 53 |
+
|
| 54 |
+
prompt = "a photo of an astronaut riding a horse on mars"
|
| 55 |
+
image = pipe(prompt).images[0]
|
| 56 |
+
|
| 57 |
+
image.save("astronaut_rides_horse.png")
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
**Notes**:
|
| 61 |
+
- Despite not being a dependency, we highly recommend you to install [xformers](https://github.com/facebookresearch/xformers) for memory efficient attention (better performance)
|
| 62 |
+
- If you have low GPU RAM available, make sure to add a `pipe.enable_attention_slicing()` after sending it to `cuda` for less VRAM usage (to the cost of speed)
|
| 63 |
+
|
| 64 |
+
# Uses
|
| 65 |
+
|
| 66 |
+
## Direct Use
|
| 67 |
+
The model is intended for research purposes only. Possible research areas and tasks include
|
| 68 |
+
|
| 69 |
+
- Safe deployment of models which have the potential to generate harmful content.
|
| 70 |
+
- Probing and understanding the limitations and biases of generative models.
|
| 71 |
+
- Generation of artworks and use in design and other artistic processes.
|
| 72 |
+
- Applications in educational or creative tools.
|
| 73 |
+
- Research on generative models.
|
| 74 |
+
|
| 75 |
+
Excluded uses are described below.
|
| 76 |
+
|
| 77 |
+
### Misuse, Malicious Use, and Out-of-Scope Use
|
| 78 |
+
_Note: This section is originally taken from the [DALLE-MINI model card](https://huggingface.co/dalle-mini/dalle-mini), was used for Stable Diffusion v1, but applies in the same way to Stable Diffusion v2_.
|
| 79 |
+
|
| 80 |
+
The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
|
| 81 |
+
|
| 82 |
+
#### Out-of-Scope Use
|
| 83 |
+
The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model.
|
| 84 |
+
|
| 85 |
+
#### Misuse and Malicious Use
|
| 86 |
+
Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
|
| 87 |
+
|
| 88 |
+
- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
|
| 89 |
+
- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
|
| 90 |
+
- Impersonating individuals without their consent.
|
| 91 |
+
- Sexual content without consent of the people who might see it.
|
| 92 |
+
- Mis- and disinformation
|
| 93 |
+
- Representations of egregious violence and gore
|
| 94 |
+
- Sharing of copyrighted or licensed material in violation of its terms of use.
|
| 95 |
+
- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
|
| 96 |
+
|
| 97 |
+
## Limitations and Bias
|
| 98 |
+
|
| 99 |
+
### Limitations
|
| 100 |
+
|
| 101 |
+
- The model does not achieve perfect photorealism
|
| 102 |
+
- The model cannot render legible text
|
| 103 |
+
- The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere”
|
| 104 |
+
- Faces and people in general may not be generated properly.
|
| 105 |
+
- The model was trained mainly with English captions and will not work as well in other languages.
|
| 106 |
+
- The autoencoding part of the model is lossy
|
| 107 |
+
- The model was trained on a subset of the large-scale dataset
|
| 108 |
+
[LAION-5B](https://laion.ai/blog/laion-5b/), which contains adult, violent and sexual content. To partially mitigate this, we have filtered the dataset using LAION's NFSW detector (see Training section).
|
| 109 |
+
|
| 110 |
+
### Bias
|
| 111 |
+
While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
|
| 112 |
+
Stable Diffusion vw was primarily trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
|
| 113 |
+
which consists of images that are limited to English descriptions.
|
| 114 |
+
Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
|
| 115 |
+
This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
|
| 116 |
+
ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.
|
| 117 |
+
Stable Diffusion v2 mirrors and exacerbates biases to such a degree that viewer discretion must be advised irrespective of the input or its intent.
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
## Training
|
| 121 |
+
|
| 122 |
+
**Training Data**
|
| 123 |
+
The model developers used the following dataset for training the model:
|
| 124 |
+
|
| 125 |
+
- LAION-5B and subsets (details below). The training data is further filtered using LAION's NSFW detector, with a "p_unsafe" score of 0.1 (conservative). For more details, please refer to LAION-5B's [NeurIPS 2022](https://openreview.net/forum?id=M3Y74vmsMcY) paper and reviewer discussions on the topic.
|
| 126 |
+
|
| 127 |
+
**Training Procedure**
|
| 128 |
+
Stable Diffusion v2 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,
|
| 129 |
+
|
| 130 |
+
- Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4
|
| 131 |
+
- Text prompts are encoded through the OpenCLIP-ViT/H text-encoder.
|
| 132 |
+
- The output of the text encoder is fed into the UNet backbone of the latent diffusion model via cross-attention.
|
| 133 |
+
- The loss is a reconstruction objective between the noise that was added to the latent and the prediction made by the UNet. We also use the so-called _v-objective_, see https://arxiv.org/abs/2202.00512.
|
| 134 |
+
|
| 135 |
+
We currently provide the following checkpoints, for various versions:
|
| 136 |
+
|
| 137 |
+
### Version 2.1
|
| 138 |
+
- `512-base-ema.ckpt`: Fine-tuned on `512-base-ema.ckpt` 2.0 with 220k extra steps taken, with `punsafe=0.98` on the same dataset.
|
| 139 |
+
- `768-v-ema.ckpt`: Resumed from `768-v-ema.ckpt` 2.0 with an additional 55k steps on the same dataset (`punsafe=0.1`), and then fine-tuned for another 155k extra steps with `punsafe=0.98`.
|
| 140 |
+
|
| 141 |
+
### Version 2.0
|
| 142 |
+
- `512-base-ema.ckpt`: 550k steps at resolution `256x256` on a subset of [LAION-5B](https://laion.ai/blog/laion-5b/) filtered for explicit pornographic material, using the [LAION-NSFW classifier](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) with `punsafe=0.1` and an [aesthetic score](https://github.com/christophschuhmann/improved-aesthetic-predictor) >= `4.5`.
|
| 143 |
+
850k steps at resolution `512x512` on the same dataset with resolution `>= 512x512`.
|
| 144 |
+
- `768-v-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for 150k steps using a [v-objective](https://arxiv.org/abs/2202.00512) on the same dataset. Resumed for another 140k steps on a `768x768` subset of our dataset.
|
| 145 |
+
- `512-depth-ema.ckpt`: Resumed from `512-base-ema.ckpt` and finetuned for 200k steps. Added an extra input channel to process the (relative) depth prediction produced by [MiDaS](https://github.com/isl-org/MiDaS) (`dpt_hybrid`) which is used as an additional conditioning.
|
| 146 |
+
The additional input channels of the U-Net which process this extra information were zero-initialized.
|
| 147 |
+
- `512-inpainting-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for another 200k steps. Follows the mask-generation strategy presented in [LAMA](https://github.com/saic-mdal/lama) which, in combination with the latent VAE representations of the masked image, are used as an additional conditioning.
|
| 148 |
+
The additional input channels of the U-Net which process this extra information were zero-initialized. The same strategy was used to train the [1.5-inpainting checkpoint](https://github.com/saic-mdal/lama).
|
| 149 |
+
- `x4-upscaling-ema.ckpt`: Trained for 1.25M steps on a 10M subset of LAION containing images `>2048x2048`. The model was trained on crops of size `512x512` and is a text-guided [latent upscaling diffusion model](https://arxiv.org/abs/2112.10752).
|
| 150 |
+
In addition to the textual input, it receives a `noise_level` as an input parameter, which can be used to add noise to the low-resolution input according to a [predefined diffusion schedule](configs/stable-diffusion/x4-upscaling.yaml).
|
| 151 |
+
|
| 152 |
+
- **Hardware:** 32 x 8 x A100 GPUs
|
| 153 |
+
- **Optimizer:** AdamW
|
| 154 |
+
- **Gradient Accumulations**: 1
|
| 155 |
+
- **Batch:** 32 x 8 x 2 x 4 = 2048
|
| 156 |
+
- **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant
|
| 157 |
+
|
| 158 |
+
## Evaluation Results
|
| 159 |
+
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
|
| 160 |
+
5.0, 6.0, 7.0, 8.0) and 50 steps DDIM sampling steps show the relative improvements of the checkpoints:
|
| 161 |
+
|
| 162 |
+

|
| 163 |
+
|
| 164 |
+
Evaluated using 50 DDIM steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores.
|
| 165 |
+
|
| 166 |
+
## Environmental Impact
|
| 167 |
+
|
| 168 |
+
**Stable Diffusion v1** **Estimated Emissions**
|
| 169 |
+
Based on that information, we estimate the following CO2 emissions using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). The hardware, runtime, cloud provider, and compute region were utilized to estimate the carbon impact.
|
| 170 |
+
|
| 171 |
+
- **Hardware Type:** A100 PCIe 40GB
|
| 172 |
+
- **Hours used:** 200000
|
| 173 |
+
- **Cloud Provider:** AWS
|
| 174 |
+
- **Compute Region:** US-east
|
| 175 |
+
- **Carbon Emitted (Power consumption x Time x Carbon produced based on location of power grid):** 15000 kg CO2 eq.
|
| 176 |
+
|
| 177 |
+
## Citation
|
| 178 |
+
@InProceedings{Rombach_2022_CVPR,
|
| 179 |
+
author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
|
| 180 |
+
title = {High-Resolution Image Synthesis With Latent Diffusion Models},
|
| 181 |
+
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
| 182 |
+
month = {June},
|
| 183 |
+
year = {2022},
|
| 184 |
+
pages = {10684-10695}
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
*This model card was written by: Robin Rombach, Patrick Esser and David Ha and is based on the [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md) and [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
|
models/sd2.1_base/feature_extractor/preprocessor_config.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"crop_size": 224,
|
| 3 |
+
"do_center_crop": true,
|
| 4 |
+
"do_convert_rgb": true,
|
| 5 |
+
"do_normalize": true,
|
| 6 |
+
"do_resize": true,
|
| 7 |
+
"feature_extractor_type": "CLIPFeatureExtractor",
|
| 8 |
+
"image_mean": [
|
| 9 |
+
0.48145466,
|
| 10 |
+
0.4578275,
|
| 11 |
+
0.40821073
|
| 12 |
+
],
|
| 13 |
+
"image_std": [
|
| 14 |
+
0.26862954,
|
| 15 |
+
0.26130258,
|
| 16 |
+
0.27577711
|
| 17 |
+
],
|
| 18 |
+
"resample": 3,
|
| 19 |
+
"size": 224
|
| 20 |
+
}
|
models/sd2.1_base/gitattributes
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
models/sd2.1_base/model_index.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "StableDiffusionPipeline",
|
| 3 |
+
"_diffusers_version": "0.10.0.dev0",
|
| 4 |
+
"feature_extractor": [
|
| 5 |
+
"transformers",
|
| 6 |
+
"CLIPImageProcessor"
|
| 7 |
+
],
|
| 8 |
+
"requires_safety_checker": false,
|
| 9 |
+
"safety_checker": [
|
| 10 |
+
null,
|
| 11 |
+
null
|
| 12 |
+
],
|
| 13 |
+
"scheduler": [
|
| 14 |
+
"diffusers",
|
| 15 |
+
"PNDMScheduler"
|
| 16 |
+
],
|
| 17 |
+
"text_encoder": [
|
| 18 |
+
"transformers",
|
| 19 |
+
"CLIPTextModel"
|
| 20 |
+
],
|
| 21 |
+
"tokenizer": [
|
| 22 |
+
"transformers",
|
| 23 |
+
"CLIPTokenizer"
|
| 24 |
+
],
|
| 25 |
+
"unet": [
|
| 26 |
+
"diffusers",
|
| 27 |
+
"UNet2DConditionModel"
|
| 28 |
+
],
|
| 29 |
+
"vae": [
|
| 30 |
+
"diffusers",
|
| 31 |
+
"AutoencoderKL"
|
| 32 |
+
]
|
| 33 |
+
}
|
models/sd2.1_base/model_index_paddle.json
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "StableDiffusionPipeline",
|
| 3 |
+
"_diffusers_version": "0.21.0",
|
| 4 |
+
"feature_extractor": [
|
| 5 |
+
"transformers",
|
| 6 |
+
"CLIPImageProcessor"
|
| 7 |
+
],
|
| 8 |
+
"requires_safety_checker": false,
|
| 9 |
+
"safety_checker": [
|
| 10 |
+
null,
|
| 11 |
+
null
|
| 12 |
+
],
|
| 13 |
+
"scheduler": [
|
| 14 |
+
"diffusers",
|
| 15 |
+
"PNDMScheduler"
|
| 16 |
+
],
|
| 17 |
+
"text_encoder": [
|
| 18 |
+
"transformers",
|
| 19 |
+
"CLIPTextModel"
|
| 20 |
+
],
|
| 21 |
+
"tokenizer": [
|
| 22 |
+
"transformers",
|
| 23 |
+
"CLIPTokenizer"
|
| 24 |
+
],
|
| 25 |
+
"unet": [
|
| 26 |
+
"diffusers",
|
| 27 |
+
"UNet2DConditionModel"
|
| 28 |
+
],
|
| 29 |
+
"vae": [
|
| 30 |
+
"diffusers",
|
| 31 |
+
"AutoencoderKL"
|
| 32 |
+
]
|
| 33 |
+
}
|
models/sd2.1_base/scheduler/scheduler_config.json
ADDED
|
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "PNDMScheduler",
|
| 3 |
+
"_diffusers_version": "0.10.0.dev0",
|
| 4 |
+
"beta_end": 0.012,
|
| 5 |
+
"beta_schedule": "scaled_linear",
|
| 6 |
+
"beta_start": 0.00085,
|
| 7 |
+
"clip_sample": false,
|
| 8 |
+
"num_train_timesteps": 1000,
|
| 9 |
+
"prediction_type": "epsilon",
|
| 10 |
+
"set_alpha_to_one": false,
|
| 11 |
+
"skip_prk_steps": true,
|
| 12 |
+
"steps_offset": 1,
|
| 13 |
+
"trained_betas": null
|
| 14 |
+
}
|
models/sd2.1_base/text_encoder/config.json
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "stabilityai/stable-diffusion-2",
|
| 3 |
+
"architectures": [
|
| 4 |
+
"CLIPTextModel"
|
| 5 |
+
],
|
| 6 |
+
"attention_dropout": 0.0,
|
| 7 |
+
"bos_token_id": 0,
|
| 8 |
+
"dropout": 0.0,
|
| 9 |
+
"eos_token_id": 2,
|
| 10 |
+
"hidden_act": "gelu",
|
| 11 |
+
"hidden_size": 1024,
|
| 12 |
+
"initializer_factor": 1.0,
|
| 13 |
+
"initializer_range": 0.02,
|
| 14 |
+
"intermediate_size": 4096,
|
| 15 |
+
"layer_norm_eps": 1e-05,
|
| 16 |
+
"max_position_embeddings": 77,
|
| 17 |
+
"model_type": "clip_text_model",
|
| 18 |
+
"num_attention_heads": 16,
|
| 19 |
+
"num_hidden_layers": 23,
|
| 20 |
+
"pad_token_id": 1,
|
| 21 |
+
"projection_dim": 512,
|
| 22 |
+
"torch_dtype": "float32",
|
| 23 |
+
"transformers_version": "4.25.0.dev0",
|
| 24 |
+
"vocab_size": 49408
|
| 25 |
+
}
|
models/sd2.1_base/text_encoder/model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cce6febb0b6d876ee5eb24af35e27e764eb4f9b1d0b7c026c8c3333d4cfc916c
|
| 3 |
+
size 1361597018
|
models/sd2.1_base/tokenizer/merges.txt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/sd2.1_base/tokenizer/special_tokens_map.json
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"bos_token": {
|
| 3 |
+
"content": "<|startoftext|>",
|
| 4 |
+
"lstrip": false,
|
| 5 |
+
"normalized": true,
|
| 6 |
+
"rstrip": false,
|
| 7 |
+
"single_word": false
|
| 8 |
+
},
|
| 9 |
+
"eos_token": {
|
| 10 |
+
"content": "<|endoftext|>",
|
| 11 |
+
"lstrip": false,
|
| 12 |
+
"normalized": true,
|
| 13 |
+
"rstrip": false,
|
| 14 |
+
"single_word": false
|
| 15 |
+
},
|
| 16 |
+
"pad_token": "!",
|
| 17 |
+
"unk_token": {
|
| 18 |
+
"content": "<|endoftext|>",
|
| 19 |
+
"lstrip": false,
|
| 20 |
+
"normalized": true,
|
| 21 |
+
"rstrip": false,
|
| 22 |
+
"single_word": false
|
| 23 |
+
}
|
| 24 |
+
}
|
models/sd2.1_base/tokenizer/tokenizer_config.json
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"add_prefix_space": false,
|
| 3 |
+
"bos_token": {
|
| 4 |
+
"__type": "AddedToken",
|
| 5 |
+
"content": "<|startoftext|>",
|
| 6 |
+
"lstrip": false,
|
| 7 |
+
"normalized": true,
|
| 8 |
+
"rstrip": false,
|
| 9 |
+
"single_word": false
|
| 10 |
+
},
|
| 11 |
+
"do_lower_case": true,
|
| 12 |
+
"eos_token": {
|
| 13 |
+
"__type": "AddedToken",
|
| 14 |
+
"content": "<|endoftext|>",
|
| 15 |
+
"lstrip": false,
|
| 16 |
+
"normalized": true,
|
| 17 |
+
"rstrip": false,
|
| 18 |
+
"single_word": false
|
| 19 |
+
},
|
| 20 |
+
"errors": "replace",
|
| 21 |
+
"model_max_length": 77,
|
| 22 |
+
"name_or_path": "stabilityai/stable-diffusion-2",
|
| 23 |
+
"pad_token": "<|endoftext|>",
|
| 24 |
+
"special_tokens_map_file": "./special_tokens_map.json",
|
| 25 |
+
"tokenizer_class": "CLIPTokenizer",
|
| 26 |
+
"unk_token": {
|
| 27 |
+
"__type": "AddedToken",
|
| 28 |
+
"content": "<|endoftext|>",
|
| 29 |
+
"lstrip": false,
|
| 30 |
+
"normalized": true,
|
| 31 |
+
"rstrip": false,
|
| 32 |
+
"single_word": false
|
| 33 |
+
}
|
| 34 |
+
}
|
models/sd2.1_base/tokenizer/vocab.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/sd2.1_base/unet/config.json
ADDED
|
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "UNet2DConditionModel",
|
| 3 |
+
"_diffusers_version": "0.10.0.dev0",
|
| 4 |
+
"act_fn": "silu",
|
| 5 |
+
"attention_head_dim": [
|
| 6 |
+
5,
|
| 7 |
+
10,
|
| 8 |
+
20,
|
| 9 |
+
20
|
| 10 |
+
],
|
| 11 |
+
"block_out_channels": [
|
| 12 |
+
320,
|
| 13 |
+
640,
|
| 14 |
+
1280,
|
| 15 |
+
1280
|
| 16 |
+
],
|
| 17 |
+
"center_input_sample": false,
|
| 18 |
+
"cross_attention_dim": 1024,
|
| 19 |
+
"down_block_types": [
|
| 20 |
+
"CrossAttnDownBlock2D",
|
| 21 |
+
"CrossAttnDownBlock2D",
|
| 22 |
+
"CrossAttnDownBlock2D",
|
| 23 |
+
"DownBlock2D"
|
| 24 |
+
],
|
| 25 |
+
"downsample_padding": 1,
|
| 26 |
+
"dual_cross_attention": false,
|
| 27 |
+
"flip_sin_to_cos": true,
|
| 28 |
+
"freq_shift": 0,
|
| 29 |
+
"in_channels": 4,
|
| 30 |
+
"layers_per_block": 2,
|
| 31 |
+
"mid_block_scale_factor": 1,
|
| 32 |
+
"norm_eps": 1e-05,
|
| 33 |
+
"norm_num_groups": 32,
|
| 34 |
+
"num_class_embeds": null,
|
| 35 |
+
"only_cross_attention": false,
|
| 36 |
+
"out_channels": 4,
|
| 37 |
+
"sample_size": 64,
|
| 38 |
+
"up_block_types": [
|
| 39 |
+
"UpBlock2D",
|
| 40 |
+
"CrossAttnUpBlock2D",
|
| 41 |
+
"CrossAttnUpBlock2D",
|
| 42 |
+
"CrossAttnUpBlock2D"
|
| 43 |
+
],
|
| 44 |
+
"use_linear_projection": true
|
| 45 |
+
}
|
models/sd2.1_base/unet/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6dfae3e5f7d459b50f4b0850ead945972c75bb0e1897628933e169eb43974214
|
| 3 |
+
size 3463726498
|
models/sd2.1_base/v2-1_512-nonema-pruned.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fc4f9fe7528b2ee3de21971fb805bbf74d680bf1ab5b5f9c08379b0397b82a9d
|
| 3 |
+
size 5214604312
|
models/sd2.1_base/vae/config.json
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKL",
|
| 3 |
+
"_diffusers_version": "0.10.0.dev0",
|
| 4 |
+
"act_fn": "silu",
|
| 5 |
+
"block_out_channels": [
|
| 6 |
+
128,
|
| 7 |
+
256,
|
| 8 |
+
512,
|
| 9 |
+
512
|
| 10 |
+
],
|
| 11 |
+
"down_block_types": [
|
| 12 |
+
"DownEncoderBlock2D",
|
| 13 |
+
"DownEncoderBlock2D",
|
| 14 |
+
"DownEncoderBlock2D",
|
| 15 |
+
"DownEncoderBlock2D"
|
| 16 |
+
],
|
| 17 |
+
"in_channels": 3,
|
| 18 |
+
"latent_channels": 4,
|
| 19 |
+
"layers_per_block": 2,
|
| 20 |
+
"norm_num_groups": 32,
|
| 21 |
+
"out_channels": 3,
|
| 22 |
+
"sample_size": 768,
|
| 23 |
+
"up_block_types": [
|
| 24 |
+
"UpDecoderBlock2D",
|
| 25 |
+
"UpDecoderBlock2D",
|
| 26 |
+
"UpDecoderBlock2D",
|
| 27 |
+
"UpDecoderBlock2D"
|
| 28 |
+
]
|
| 29 |
+
}
|
models/sd2.1_base/vae/diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:a1d993488569e928462932c8c38a0760b874d166399b14414135bd9c42df5815
|
| 3 |
+
size 334643276
|
models/sd2.1_base_paddle/README.md
ADDED
|
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: openrail++
|
| 3 |
+
tags:
|
| 4 |
+
- stable-diffusion
|
| 5 |
+
- text-to-image
|
| 6 |
+
---
|
| 7 |
+
|
| 8 |
+
# Stable Diffusion v2-1-base Model Card
|
| 9 |
+
This model card focuses on the model associated with the Stable Diffusion v2-1-base model.
|
| 10 |
+
|
| 11 |
+
This `stable-diffusion-2-1-base` model fine-tunes [stable-diffusion-2-base](https://huggingface.co/stabilityai/stable-diffusion-2-base) (`512-base-ema.ckpt`) with 220k extra steps taken, with `punsafe=0.98` on the same dataset.
|
| 12 |
+
|
| 13 |
+
- Use it with the [`stablediffusion`](https://github.com/Stability-AI/stablediffusion) repository: download the `v2-1_512-ema-pruned.ckpt` [here](https://huggingface.co/stabilityai/stable-diffusion-2-1-base/resolve/main/v2-1_512-ema-pruned.ckpt).
|
| 14 |
+
- Use it with 🧨 [`diffusers`](#examples)
|
| 15 |
+
|
| 16 |
+
## Model Details
|
| 17 |
+
- **Developed by:** Robin Rombach, Patrick Esser
|
| 18 |
+
- **Model type:** Diffusion-based text-to-image generation model
|
| 19 |
+
- **Language(s):** English
|
| 20 |
+
- **License:** [CreativeML Open RAIL++-M License](https://huggingface.co/stabilityai/stable-diffusion-2/blob/main/LICENSE-MODEL)
|
| 21 |
+
- **Model Description:** This is a model that can be used to generate and modify images based on text prompts. It is a [Latent Diffusion Model](https://arxiv.org/abs/2112.10752) that uses a fixed, pretrained text encoder ([OpenCLIP-ViT/H](https://github.com/mlfoundations/open_clip)).
|
| 22 |
+
- **Resources for more information:** [GitHub Repository](https://github.com/Stability-AI/).
|
| 23 |
+
- **Cite as:**
|
| 24 |
+
|
| 25 |
+
@InProceedings{Rombach_2022_CVPR,
|
| 26 |
+
author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
|
| 27 |
+
title = {High-Resolution Image Synthesis With Latent Diffusion Models},
|
| 28 |
+
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
| 29 |
+
month = {June},
|
| 30 |
+
year = {2022},
|
| 31 |
+
pages = {10684-10695}
|
| 32 |
+
}
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
## Examples
|
| 36 |
+
|
| 37 |
+
Using the [🤗's Diffusers library](https://github.com/huggingface/diffusers) to run Stable Diffusion 2 in a simple and efficient manner.
|
| 38 |
+
|
| 39 |
+
```bash
|
| 40 |
+
pip install diffusers transformers accelerate scipy safetensors
|
| 41 |
+
```
|
| 42 |
+
Running the pipeline (if you don't swap the scheduler it will run with the default PNDM/PLMS scheduler, in this example we are swapping it to EulerDiscreteScheduler):
|
| 43 |
+
|
| 44 |
+
```python
|
| 45 |
+
from diffusers import StableDiffusionPipeline, EulerDiscreteScheduler
|
| 46 |
+
import torch
|
| 47 |
+
|
| 48 |
+
model_id = "stabilityai/stable-diffusion-2-1-base"
|
| 49 |
+
|
| 50 |
+
scheduler = EulerDiscreteScheduler.from_pretrained(model_id, subfolder="scheduler")
|
| 51 |
+
pipe = StableDiffusionPipeline.from_pretrained(model_id, scheduler=scheduler, torch_dtype=torch.float16)
|
| 52 |
+
pipe = pipe.to("cuda")
|
| 53 |
+
|
| 54 |
+
prompt = "a photo of an astronaut riding a horse on mars"
|
| 55 |
+
image = pipe(prompt).images[0]
|
| 56 |
+
|
| 57 |
+
image.save("astronaut_rides_horse.png")
|
| 58 |
+
```
|
| 59 |
+
|
| 60 |
+
**Notes**:
|
| 61 |
+
- Despite not being a dependency, we highly recommend you to install [xformers](https://github.com/facebookresearch/xformers) for memory efficient attention (better performance)
|
| 62 |
+
- If you have low GPU RAM available, make sure to add a `pipe.enable_attention_slicing()` after sending it to `cuda` for less VRAM usage (to the cost of speed)
|
| 63 |
+
|
| 64 |
+
# Uses
|
| 65 |
+
|
| 66 |
+
## Direct Use
|
| 67 |
+
The model is intended for research purposes only. Possible research areas and tasks include
|
| 68 |
+
|
| 69 |
+
- Safe deployment of models which have the potential to generate harmful content.
|
| 70 |
+
- Probing and understanding the limitations and biases of generative models.
|
| 71 |
+
- Generation of artworks and use in design and other artistic processes.
|
| 72 |
+
- Applications in educational or creative tools.
|
| 73 |
+
- Research on generative models.
|
| 74 |
+
|
| 75 |
+
Excluded uses are described below.
|
| 76 |
+
|
| 77 |
+
### Misuse, Malicious Use, and Out-of-Scope Use
|
| 78 |
+
_Note: This section is originally taken from the [DALLE-MINI model card](https://huggingface.co/dalle-mini/dalle-mini), was used for Stable Diffusion v1, but applies in the same way to Stable Diffusion v2_.
|
| 79 |
+
|
| 80 |
+
The model should not be used to intentionally create or disseminate images that create hostile or alienating environments for people. This includes generating images that people would foreseeably find disturbing, distressing, or offensive; or content that propagates historical or current stereotypes.
|
| 81 |
+
|
| 82 |
+
#### Out-of-Scope Use
|
| 83 |
+
The model was not trained to be factual or true representations of people or events, and therefore using the model to generate such content is out-of-scope for the abilities of this model.
|
| 84 |
+
|
| 85 |
+
#### Misuse and Malicious Use
|
| 86 |
+
Using the model to generate content that is cruel to individuals is a misuse of this model. This includes, but is not limited to:
|
| 87 |
+
|
| 88 |
+
- Generating demeaning, dehumanizing, or otherwise harmful representations of people or their environments, cultures, religions, etc.
|
| 89 |
+
- Intentionally promoting or propagating discriminatory content or harmful stereotypes.
|
| 90 |
+
- Impersonating individuals without their consent.
|
| 91 |
+
- Sexual content without consent of the people who might see it.
|
| 92 |
+
- Mis- and disinformation
|
| 93 |
+
- Representations of egregious violence and gore
|
| 94 |
+
- Sharing of copyrighted or licensed material in violation of its terms of use.
|
| 95 |
+
- Sharing content that is an alteration of copyrighted or licensed material in violation of its terms of use.
|
| 96 |
+
|
| 97 |
+
## Limitations and Bias
|
| 98 |
+
|
| 99 |
+
### Limitations
|
| 100 |
+
|
| 101 |
+
- The model does not achieve perfect photorealism
|
| 102 |
+
- The model cannot render legible text
|
| 103 |
+
- The model does not perform well on more difficult tasks which involve compositionality, such as rendering an image corresponding to “A red cube on top of a blue sphere”
|
| 104 |
+
- Faces and people in general may not be generated properly.
|
| 105 |
+
- The model was trained mainly with English captions and will not work as well in other languages.
|
| 106 |
+
- The autoencoding part of the model is lossy
|
| 107 |
+
- The model was trained on a subset of the large-scale dataset
|
| 108 |
+
[LAION-5B](https://laion.ai/blog/laion-5b/), which contains adult, violent and sexual content. To partially mitigate this, we have filtered the dataset using LAION's NFSW detector (see Training section).
|
| 109 |
+
|
| 110 |
+
### Bias
|
| 111 |
+
While the capabilities of image generation models are impressive, they can also reinforce or exacerbate social biases.
|
| 112 |
+
Stable Diffusion vw was primarily trained on subsets of [LAION-2B(en)](https://laion.ai/blog/laion-5b/),
|
| 113 |
+
which consists of images that are limited to English descriptions.
|
| 114 |
+
Texts and images from communities and cultures that use other languages are likely to be insufficiently accounted for.
|
| 115 |
+
This affects the overall output of the model, as white and western cultures are often set as the default. Further, the
|
| 116 |
+
ability of the model to generate content with non-English prompts is significantly worse than with English-language prompts.
|
| 117 |
+
Stable Diffusion v2 mirrors and exacerbates biases to such a degree that viewer discretion must be advised irrespective of the input or its intent.
|
| 118 |
+
|
| 119 |
+
|
| 120 |
+
## Training
|
| 121 |
+
|
| 122 |
+
**Training Data**
|
| 123 |
+
The model developers used the following dataset for training the model:
|
| 124 |
+
|
| 125 |
+
- LAION-5B and subsets (details below). The training data is further filtered using LAION's NSFW detector, with a "p_unsafe" score of 0.1 (conservative). For more details, please refer to LAION-5B's [NeurIPS 2022](https://openreview.net/forum?id=M3Y74vmsMcY) paper and reviewer discussions on the topic.
|
| 126 |
+
|
| 127 |
+
**Training Procedure**
|
| 128 |
+
Stable Diffusion v2 is a latent diffusion model which combines an autoencoder with a diffusion model that is trained in the latent space of the autoencoder. During training,
|
| 129 |
+
|
| 130 |
+
- Images are encoded through an encoder, which turns images into latent representations. The autoencoder uses a relative downsampling factor of 8 and maps images of shape H x W x 3 to latents of shape H/f x W/f x 4
|
| 131 |
+
- Text prompts are encoded through the OpenCLIP-ViT/H text-encoder.
|
| 132 |
+
- The output of the text encoder is fed into the UNet backbone of the latent diffusion model via cross-attention.
|
| 133 |
+
- The loss is a reconstruction objective between the noise that was added to the latent and the prediction made by the UNet. We also use the so-called _v-objective_, see https://arxiv.org/abs/2202.00512.
|
| 134 |
+
|
| 135 |
+
We currently provide the following checkpoints, for various versions:
|
| 136 |
+
|
| 137 |
+
### Version 2.1
|
| 138 |
+
- `512-base-ema.ckpt`: Fine-tuned on `512-base-ema.ckpt` 2.0 with 220k extra steps taken, with `punsafe=0.98` on the same dataset.
|
| 139 |
+
- `768-v-ema.ckpt`: Resumed from `768-v-ema.ckpt` 2.0 with an additional 55k steps on the same dataset (`punsafe=0.1`), and then fine-tuned for another 155k extra steps with `punsafe=0.98`.
|
| 140 |
+
|
| 141 |
+
### Version 2.0
|
| 142 |
+
- `512-base-ema.ckpt`: 550k steps at resolution `256x256` on a subset of [LAION-5B](https://laion.ai/blog/laion-5b/) filtered for explicit pornographic material, using the [LAION-NSFW classifier](https://github.com/LAION-AI/CLIP-based-NSFW-Detector) with `punsafe=0.1` and an [aesthetic score](https://github.com/christophschuhmann/improved-aesthetic-predictor) >= `4.5`.
|
| 143 |
+
850k steps at resolution `512x512` on the same dataset with resolution `>= 512x512`.
|
| 144 |
+
- `768-v-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for 150k steps using a [v-objective](https://arxiv.org/abs/2202.00512) on the same dataset. Resumed for another 140k steps on a `768x768` subset of our dataset.
|
| 145 |
+
- `512-depth-ema.ckpt`: Resumed from `512-base-ema.ckpt` and finetuned for 200k steps. Added an extra input channel to process the (relative) depth prediction produced by [MiDaS](https://github.com/isl-org/MiDaS) (`dpt_hybrid`) which is used as an additional conditioning.
|
| 146 |
+
The additional input channels of the U-Net which process this extra information were zero-initialized.
|
| 147 |
+
- `512-inpainting-ema.ckpt`: Resumed from `512-base-ema.ckpt` and trained for another 200k steps. Follows the mask-generation strategy presented in [LAMA](https://github.com/saic-mdal/lama) which, in combination with the latent VAE representations of the masked image, are used as an additional conditioning.
|
| 148 |
+
The additional input channels of the U-Net which process this extra information were zero-initialized. The same strategy was used to train the [1.5-inpainting checkpoint](https://github.com/saic-mdal/lama).
|
| 149 |
+
- `x4-upscaling-ema.ckpt`: Trained for 1.25M steps on a 10M subset of LAION containing images `>2048x2048`. The model was trained on crops of size `512x512` and is a text-guided [latent upscaling diffusion model](https://arxiv.org/abs/2112.10752).
|
| 150 |
+
In addition to the textual input, it receives a `noise_level` as an input parameter, which can be used to add noise to the low-resolution input according to a [predefined diffusion schedule](configs/stable-diffusion/x4-upscaling.yaml).
|
| 151 |
+
|
| 152 |
+
- **Hardware:** 32 x 8 x A100 GPUs
|
| 153 |
+
- **Optimizer:** AdamW
|
| 154 |
+
- **Gradient Accumulations**: 1
|
| 155 |
+
- **Batch:** 32 x 8 x 2 x 4 = 2048
|
| 156 |
+
- **Learning rate:** warmup to 0.0001 for 10,000 steps and then kept constant
|
| 157 |
+
|
| 158 |
+
## Evaluation Results
|
| 159 |
+
Evaluations with different classifier-free guidance scales (1.5, 2.0, 3.0, 4.0,
|
| 160 |
+
5.0, 6.0, 7.0, 8.0) and 50 steps DDIM sampling steps show the relative improvements of the checkpoints:
|
| 161 |
+
|
| 162 |
+

|
| 163 |
+
|
| 164 |
+
Evaluated using 50 DDIM steps and 10000 random prompts from the COCO2017 validation set, evaluated at 512x512 resolution. Not optimized for FID scores.
|
| 165 |
+
|
| 166 |
+
## Environmental Impact
|
| 167 |
+
|
| 168 |
+
**Stable Diffusion v1** **Estimated Emissions**
|
| 169 |
+
Based on that information, we estimate the following CO2 emissions using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700). The hardware, runtime, cloud provider, and compute region were utilized to estimate the carbon impact.
|
| 170 |
+
|
| 171 |
+
- **Hardware Type:** A100 PCIe 40GB
|
| 172 |
+
- **Hours used:** 200000
|
| 173 |
+
- **Cloud Provider:** AWS
|
| 174 |
+
- **Compute Region:** US-east
|
| 175 |
+
- **Carbon Emitted (Power consumption x Time x Carbon produced based on location of power grid):** 15000 kg CO2 eq.
|
| 176 |
+
|
| 177 |
+
## Citation
|
| 178 |
+
@InProceedings{Rombach_2022_CVPR,
|
| 179 |
+
author = {Rombach, Robin and Blattmann, Andreas and Lorenz, Dominik and Esser, Patrick and Ommer, Bj\"orn},
|
| 180 |
+
title = {High-Resolution Image Synthesis With Latent Diffusion Models},
|
| 181 |
+
booktitle = {Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
|
| 182 |
+
month = {June},
|
| 183 |
+
year = {2022},
|
| 184 |
+
pages = {10684-10695}
|
| 185 |
+
}
|
| 186 |
+
|
| 187 |
+
*This model card was written by: Robin Rombach, Patrick Esser and David Ha and is based on the [Stable Diffusion v1](https://github.com/CompVis/stable-diffusion/blob/main/Stable_Diffusion_v1_Model_Card.md) and [DALL-E Mini model card](https://huggingface.co/dalle-mini/dalle-mini).*
|
models/sd2.1_base_paddle/config.json
ADDED
|
@@ -0,0 +1,38 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "StableDiffusionPipeline",
|
| 3 |
+
"_diffusers_version": "0.34.0",
|
| 4 |
+
"_name_or_path": "models/sd2.1_base_paddle",
|
| 5 |
+
"feature_extractor": [
|
| 6 |
+
"transformers",
|
| 7 |
+
"CLIPImageProcessor"
|
| 8 |
+
],
|
| 9 |
+
"image_encoder": [
|
| 10 |
+
null,
|
| 11 |
+
null
|
| 12 |
+
],
|
| 13 |
+
"requires_safety_checker": false,
|
| 14 |
+
"safety_checker": [
|
| 15 |
+
null,
|
| 16 |
+
null
|
| 17 |
+
],
|
| 18 |
+
"scheduler": [
|
| 19 |
+
"diffusers",
|
| 20 |
+
"PNDMScheduler"
|
| 21 |
+
],
|
| 22 |
+
"text_encoder": [
|
| 23 |
+
"transformers",
|
| 24 |
+
"CLIPTextModel"
|
| 25 |
+
],
|
| 26 |
+
"tokenizer": [
|
| 27 |
+
"transformers",
|
| 28 |
+
"CLIPTokenizer"
|
| 29 |
+
],
|
| 30 |
+
"unet": [
|
| 31 |
+
"diffusers",
|
| 32 |
+
"UNet2DConditionModel"
|
| 33 |
+
],
|
| 34 |
+
"vae": [
|
| 35 |
+
"diffusers",
|
| 36 |
+
"AutoencoderKL"
|
| 37 |
+
]
|
| 38 |
+
}
|
models/sd2.1_base_paddle/conversion_guide.md
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# SD2.1基础模型转换说明
|
| 2 |
+
|
| 3 |
+
## 转换状态
|
| 4 |
+
- 源文件: v2-1_512-nonema-pruned.safetensors (PyTorch格式)
|
| 5 |
+
- 目标文件: model.pdparams (PaddlePaddle格式)
|
| 6 |
+
- 文件大小: 4.9GB
|
| 7 |
+
- 状态: 待转换
|
| 8 |
+
|
| 9 |
+
## 转换方法
|
| 10 |
+
|
| 11 |
+
### 方法1: 使用diffusers库
|
| 12 |
+
```python
|
| 13 |
+
from diffusers import StableDiffusionPipeline
|
| 14 |
+
import torch
|
| 15 |
+
|
| 16 |
+
# 加载PyTorch模型
|
| 17 |
+
pipe = StableDiffusionPipeline.from_pretrained(
|
| 18 |
+
"models/sd2.1_base",
|
| 19 |
+
torch_dtype=torch.float32,
|
| 20 |
+
use_safetensors=True
|
| 21 |
+
)
|
| 22 |
+
|
| 23 |
+
# 转换为PaddlePaddle格式
|
| 24 |
+
pipe.save_pretrained("models/sd2.1_base_paddle")
|
| 25 |
+
```
|
| 26 |
+
|
| 27 |
+
### 方法2: 使用PaddlePaddle官方工具
|
| 28 |
+
```bash
|
| 29 |
+
# 安装转换工具
|
| 30 |
+
pip install paddlepaddle
|
| 31 |
+
pip install paddlenlp
|
| 32 |
+
|
| 33 |
+
# 使用转换脚本
|
| 34 |
+
python -m paddlenlp.transformers.convert_pytorch_checkpoint_to_paddle \
|
| 35 |
+
--model_name_or_path models/sd2.1_base \
|
| 36 |
+
--output_dir models/sd2.1_base_paddle
|
| 37 |
+
```
|
| 38 |
+
|
| 39 |
+
## 注意事项
|
| 40 |
+
- 确保有足够的磁盘空间
|
| 41 |
+
- 转换过程可能需要较长时间
|
| 42 |
+
- 建议在转换前备份原始文件
|
| 43 |
+
- 验证转换后的模型功能
|