recoilme committed on
Commit
61df958
·
1 Parent(s): 1b8b52e
.ipynb_checkpoints/test-checkpoint.ipynb DELETED
@@ -1,95 +0,0 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "4f62bfd9-5396-48e2-aac7-bdf639cab345",
7
- "metadata": {},
8
- "outputs": [
9
- {
10
- "name": "stderr",
11
- "output_type": "stream",
12
- "text": [
13
- "The config attributes {'block_out_channels': [128, 256, 512, 768, 768], 'force_upcast': False} were passed to AsymmetricAutoencoderKL, but are not expected and will be ignored. Please verify your config.json configuration file.\n"
14
- ]
15
- },
16
- {
17
- "name": "stdout",
18
- "output_type": "stream",
19
- "text": [
20
- "ok\n"
21
- ]
22
- }
23
- ],
24
- "source": [
25
- "import torch\n",
26
- "\n",
27
- "from torchvision import transforms, utils\n",
28
- "\n",
29
- "import diffusers\n",
30
- "from diffusers import AsymmetricAutoencoderKL\n",
31
- "\n",
32
- "from diffusers.utils import load_image\n",
33
- "\n",
34
- "def crop_image_to_nearest_divisible_by_8(img):\n",
35
- "    \"\"\"Center-crop a (C, H, W) tensor so that H and W are multiples of 8.\"\"\"\n",
36
- "    height, width = img.shape[1], img.shape[2]\n",
37
- "    if height % 8 == 0 and width % 8 == 0:\n",
38
- "        return img\n",
39
- "\n",
40
- "    # Largest size not exceeding the input that is divisible by 8.\n",
41
- "    new_height = height - (height % 8)\n",
42
- "    new_width = width - (width % 8)\n",
43
- "\n",
44
- "    # CenterCrop only takes a size; it never resamples, so it has no interpolation argument.\n",
45
- "    transform = transforms.CenterCrop((new_height, new_width))\n",
46
- "    img = transform(img).to(torch.float32).clamp(-1, 1)\n",
47
- "    return img\n",
48
- " \n",
49
- "to_tensor = transforms.ToTensor()\n",
50
- "\n",
51
- "device = \"cuda\"\n",
52
- "dtype=torch.float16\n",
53
- "vae = AsymmetricAutoencoderKL.from_pretrained(\"vae\",torch_dtype=dtype).to(device).eval()\n",
54
- "\n",
55
- "image = load_image(\"generated.png\")\n",
56
- "image = crop_image_to_nearest_divisible_by_8(to_tensor(image)).unsqueeze(0).to(device,dtype=dtype)\n",
57
- "# Inference only: no_grad skips autograd bookkeeping and saves GPU memory.\n",
58
- "with torch.no_grad():\n",
59
- "    upscaled_image = vae(image).sample\n",
60
- "# Save the reconstructed image\n",
61
- "utils.save_image(upscaled_image, \"test.png\")\n",
62
- "print('ok')"
63
- ]
64
- },
65
- {
66
- "cell_type": "code",
67
- "execution_count": null,
68
- "id": "7e3ad326-c410-44b6-a738-15b7f7e15075",
69
- "metadata": {},
70
- "outputs": [],
71
- "source": []
72
- }
73
- ],
74
- "metadata": {
75
- "kernelspec": {
76
- "display_name": "Python 3 (ipykernel)",
77
- "language": "python",
78
- "name": "python3"
79
- },
80
- "language_info": {
81
- "codemirror_mode": {
82
- "name": "ipython",
83
- "version": 3
84
- },
85
- "file_extension": ".py",
86
- "mimetype": "text/x-python",
87
- "name": "python",
88
- "nbconvert_exporter": "python",
89
- "pygments_lexer": "ipython3",
90
- "version": "3.11.6"
91
- }
92
- },
93
- "nbformat": 4,
94
- "nbformat_minor": 5
95
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ === Eval ===
2
+ ```
3
+ SD15 VAE | MSE=2.732e-03 PSNR=28.10 LPIPS=0.147 Edge=0.206 KL=19.821 | Z[min/mean/max/std]=[-17.375, 0.072, 16.203, 0.900] | Skew[min/mean/max]=[-0.543, -0.126, 0.070] | Kurt[min/mean/max]=[-0.151, 1.228, 4.574]
4
+ SDXL VAE fp16 fix | MSE=2.018e-03 PSNR=29.67 LPIPS=0.124 Edge=0.188 KL=32.222 | Z[min/mean/max/std]=[-4.066, -0.014, 4.301, 0.861] | Skew[min/mean/max]=[-0.017, 0.105, 0.165] | Kurt[min/mean/max]=[-0.380, -0.228, -0.107]
5
+ AiArtLab/sdxl_vae | MSE=1.736e-03 PSNR=30.29 LPIPS=0.116 Edge=0.181 KL=32.222 | Z[min/mean/max/std]=[-4.066, -0.014, 4.301, 0.861] | Skew[min/mean/max]=[-0.017, 0.105, 0.165] | Kurt[min/mean/max]=[-0.380, -0.228, -0.107]
6
+ LTX-Video VAE | MSE=1.202e-03 PSNR=31.84 LPIPS=0.141 Edge=0.168 KL=6.656 | Z[min/mean/max/std]=[-5.043, 0.011, 4.969, 0.272] | Skew[min/mean/max]=[-0.542, -0.018, 0.411] | Kurt[min/mean/max]=[-0.576, 0.741, 1.843]
7
+ Wan2.2-TI2V-5B | MSE=7.782e-04 PSNR=34.25 LPIPS=0.052 Edge=0.121 KL=9.472 | Z[min/mean/max/std]=[-4.789, -0.012, 4.266, 0.375] | Skew[min/mean/max]=[-0.397, 0.022, 0.653] | Kurt[min/mean/max]=[-0.482, 0.006, 0.538]
8
+ AiArtLab/wan16x_vae | MSE=7.275e-04 PSNR=34.51 LPIPS=0.051 Edge=0.118 KL=9.472 | Z[min/mean/max/std]=[-4.789, -0.012, 4.266, 0.375] | Skew[min/mean/max]=[-0.397, 0.022, 0.653] | Kurt[min/mean/max]=[-0.482, 0.006, 0.538]
9
+ Wan2.2-T2V-A14B | MSE=7.073e-04 PSNR=34.59 LPIPS=0.048 Edge=0.115 KL=7.781 | Z[min/mean/max/std]=[-15.336, -0.159, 17.703, 2.563] | Skew[min/mean/max]=[-0.343, 0.006, 0.367] | Kurt[min/mean/max]=[-0.538, -0.071, 0.594]
10
+ QwenImage | MSE=6.549e-04 PSNR=35.21 LPIPS=0.047 Edge=0.110 KL=7.776 | Z[min/mean/max/std]=[-15.297, -0.158, 17.688, 2.561] | Skew[min/mean/max]=[-0.346, 0.005, 0.368] | Kurt[min/mean/max]=[-0.538, -0.072, 0.597]
11
+ AuraDiffusion/16ch-vae | MSE=5.361e-04 PSNR=35.80 LPIPS=0.041 Edge=0.100 KL=4.421 | Z[min/mean/max/std]=[-1.373, -0.005, 1.621, 0.165] | Skew[min/mean/max]=[-0.331, 0.040, 0.413] | Kurt[min/mean/max]=[-0.170, 0.303, 0.670]
12
+ FLUX.1-schnell VAE | MSE=4.594e-04 PSNR=35.87 LPIPS=0.035 Edge=0.088 KL=13.016 | Z[min/mean/max/std]=[-5.824, -0.076, 6.246, 0.945] | Skew[min/mean/max]=[-0.268, 0.048, 0.483] | Kurt[min/mean/max]=[-0.498, 0.037, 0.568]
13
+ AiArtLab/simplevae | MSE=4.818e-04 PSNR=36.20 LPIPS=0.035 Edge=0.095 KL=4.032 | Z[min/mean/max/std]=[-7.762, -0.061, 9.914, 0.965] | Skew[min/mean/max]=[-0.320, 0.044, 0.411] | Kurt[min/mean/max]=[-0.045, 0.346, 0.696]
14
+ ```
15
+ === Percent ===
16
+ ```
17
+ | Model | PSNR | LPIPS | Edge |
18
+ |----------------------------|-----------|-----------|-----------|
19
+ | SD15 VAE | 100% | 100% | 100% |
20
+ | SDXL VAE fp16 fix | 105.6% | 118.3% | 109.7% |
21
+ | AiArtLab/sdxl_vae | 107.8% | 126.8% | 113.8% |
22
+ | LTX-Video VAE | 113.3% | 103.8% | 122.5% |
23
+ | Wan2.2-TI2V-5B | 121.9% | 280.8% | 170.8% |
24
+ | AiArtLab/wan16x_vae | 122.8% | 287.3% | 174.2% |
25
+ | Wan2.2-T2V-A14B | 123.1% | 303.2% | 179.4% |
26
+ | QwenImage | 125.3% | 308.8% | 188.0% |
27
+ | AuraDiffusion/16ch-vae | 127.4% | 355.5% | 206.6% |
28
+ | FLUX.1-schnell VAE | 127.6% | 424.4% | 234.8% |
29
+ | AiArtLab/simplevae | 128.8% | 415.2% | 217.7% |
30
+ ```