Kemsekov
/

vqvae2-pixelart

Model card Files Files and versions

xet

Community

Kemsekov committed on Mar 25, 2025

Commit

2ee3330

verified ·

1 Parent(s): 3164379

Update README.md

Browse files

Files changed (1) hide show

README.md +23 -27

README.md CHANGED Viewed

@@ -27,21 +27,23 @@ from matplotlib import pyplot as plt
 import torch
 import torchvision.transforms as T
-sample = PIL.Image.open("sample_images/cat.png") # your sample image
 sample = T.ToTensor()(sample)[None,:] # add batch dimension
-sample = T.Resize((512,512))(sample) # optional, this vqvae works fine with any input image size
-vqvae=torch.jit.load("model.pt")
-# rec is reconstruction
-# z is list of latent space tensors
-# z_q is quantized list of latent space tensors
-# ind is list of encoded indices of quantized elements in latent space
-rec, z, z_q,ind = vqvae.eval().cpu()(sample)
-rec_ind = vqvae.decode_from_ind(ind)
-rec=rec.sigmoid()
-rec_ind=rec_ind.sigmoid()
 print("Original image shape",list(sample.shape[1:]))
 print("ind shapes",[list(v.shape[1:]) for v in ind])
@@ -52,6 +54,7 @@ plt.imshow(T.ToPILImage()(sample[0]).resize((256,256)))
 plt.title("original")
 plt.axis('off')
 plt.subplot(1,3,2)
 plt.imshow(T.ToPILImage()(rec[0]).resize((256,256)))
 plt.title("reconstruction")
@@ -64,6 +67,7 @@ plt.title("reconstruction from ind")
 plt.axis('off')
 plt.show()
 plt.figure(figsize=(18,6))
 plt.subplot(1,3,1)
 plt.imshow(T.ToPILImage()(ind[0]/512).resize((256,256)))
@@ -82,7 +86,7 @@ plt.axis('off')
 plt.show()
 print("latent space render")
-for z_ in z:
     dims = len(z_[0])
     dims_sqrt = int(dims**0.5)
     plt.figure(figsize=(10,10))
@@ -98,20 +102,12 @@ for z_ in z:
 ```
 ```
-Original image shape [3, 512, 512]
-ind shapes [[128, 128], [64, 64], [32, 32]]
 ```
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/d3PSfPu9tkKZkdMv8UJSV.png)
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/pDOPnZtAh05UXfkFaklkq.png)
-And it has the following latent space
-Bottom
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/RkRVxY6uly59c8yumMTpv.png)
-Mid
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/CwR8o--prVLmR6TdL4Jt7.png)
-Top
-![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/uF95lUigW-NOYIV2EhD8h.png)
-As you can see, it properly handles different image aspects at different scales

 import torch
 import torchvision.transforms as T
+sample = PIL.Image.open("image.png") # your sample image
 sample = T.ToTensor()(sample)[None,:] # add batch dimension
+sample = T.RandomCrop((256,256))(sample) # this vqvae works fine with any input image size that is divisible by 8
+vqvae=torch.jit.load("model_v3.pt")
+# rec and rec_ind are reconstructions
+# rec is the reconstruction from latent space values z
+# rec_ind is the reconstruction from model-predicted vector indices
+# z is the latent space tensor with 64 channels, 4x smaller than the input image
+# z_layers is a list of latent space tensors at different scales
+# z_q_layers is a list of quantized latent space tensors
+# ind is a list of encoded indices of quantized elements in latent space for each scale
+z, z_layers,z_q_layers, ind = vqvae.encode(sample)
+rec_ind = vqvae.decode_from_ind(ind).sigmoid()
+rec = vqvae.decode(z).sigmoid()
 print("Original image shape",list(sample.shape[1:]))
 print("ind shapes",[list(v.shape[1:]) for v in ind])
 plt.title("original")
 plt.axis('off')
+# these two must look the same
 plt.subplot(1,3,2)
 plt.imshow(T.ToPILImage()(rec[0]).resize((256,256)))
 plt.title("reconstruction")
 plt.axis('off')
 plt.show()
+# this is expected to look like a jumbled mess
 plt.figure(figsize=(18,6))
 plt.subplot(1,3,1)
 plt.imshow(T.ToPILImage()(ind[0]/512).resize((256,256)))
 plt.show()
 print("latent space render")
+for z_ in z_layers:
     dims = len(z_[0])
     dims_sqrt = int(dims**0.5)
     plt.figure(figsize=(10,10))
 ```
 ```
+Original image shape [3, 256, 256]
+ind shapes [[64, 64], [32, 32], [16, 16]]
 ```
+Here are some examples at 256x256 resolution
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/-EEovEr-dxpp03YIloWSJ.png)
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/fPrS1L-aBN9yMYaTBjhUa.png)
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/jx4B0NfChsr4AzDh8XWl3.png)
+![image/png](https://cdn-uploads.huggingface.co/production/uploads/633b160acbdbadd99c094172/01Lsf-Zj_U4ULdMNnjGIj.png)