Upload folder using huggingface_hub

Browse files

Files changed (13) hide show

1080p/decoder_1080p_cs_discrete8_wan_patch2.onnx +3 -0
1080p/encoder_1080p_cs_discrete8_wan_patch2.onnx +3 -0
1080p/quantizer_1080p_cs_discrete8_wan_patch2.onnx +3 -0
540p/decoder_540p_cs_discrete8_wan_patch2.onnx +3 -0
540p/encoder_540p_cs_discrete8_wan_patch2.onnx +3 -0
540p/quantizer_540p_cs_discrete8_wan_patch2.onnx +3 -0
720p/decoder_720p_cs_discrete8_wan_patch2.onnx +3 -0
720p/encoder_720p_cs_discrete8_wan_patch2.onnx +3 -0
720p/quantizer_720p_cs_discrete8_wan_patch2.onnx +3 -0
best_model.pth +3 -0
config.json +22 -25
python/simple_sample_vae.py +145 -27
specs.txt +12 -8

1080p/decoder_1080p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:30db9e31f490cc9a7487f9f50b076eb6c75a03ee4cebbd76f0c131df577e9764
+size 27441527

1080p/encoder_1080p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:9811f1df9b8aed0fcff820e833b2a70cec95b89db99a1e0c578d79d5a2fc6af9
+size 23172295

1080p/quantizer_1080p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a8c5fef2505d1854984c1560e6485b06e083328ab9b159a0ac245ccca8dbfbf7
+size 8633

540p/decoder_540p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:145e78ddfbce5357170e48ebf118cd5b546f97e67c5b25d7116893dc825e8a79
+size 27441525

540p/encoder_540p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:6710c84c362d067cb780ee7906a13272f8ae37cd1342dc26b3f0209cbd5df123
+size 23172295

540p/quantizer_540p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96120f29521563219267edd904d3166cf113a8d726e9e38362e70ef962222e0a
+size 8629

720p/decoder_720p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:a80c5899556f517fbe4203d5000c17ec7ee9be9609a5430a588da1919cc9b17b
+size 27441526

720p/encoder_720p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1f083d0b791983d4cb5fc918cff6d49bc01433cb15174ff8c811adaef574377a
+size 23172295

720p/quantizer_720p_cs_discrete8_wan_patch2.onnx ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96126464259e93be1cf0c6d034ca73a1057b4563dc9f57cc3fa177a2573597d0
+size 8631

best_model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:fa319aa3982c258ee22d2f9096668b9cd42e0f3104b3a37488232edc01776ac0
+size 50493554

config.json CHANGED Viewed

@@ -1,31 +1,28 @@
 {
-    "project_name": "wan8_csquant_bs8_20251022_134201",
     "teacher_config": {
-        "dim": "96",
-        "z_dim": "16",
-        "dim_mult": "[1, 2, 4, 4]",
-        "num_res_blocks": "2",
-        "attn_scales": "[]",
-        "temporal_downsample": "[False, True, True]",
-        "dropout": "0.0",
-        "cls": "<class 'models.temporal_wan.WanVAE_'>"
-    },
     "student_config": {
-        "dim": "64",
-        "z_dim": "16",
-        "dim_mult": "[1, 2, 4, 4]",
-        "num_res_blocks": "2",
-        "attn_scales": "[]",
-        "dropout": "0.0",
-        "cls": "<class 'models.image_vae.DiscreteImageVAE'>",
-        "z_channels": "256",
-        "z_factor": "1",
-        "embedding_dim": "16",
-        "levels": "[8, 8, 8, 5, 5, 5]",
-        "dtype": "torch.float32",
-        "model_type": "wan_2_1",
-        "quantizer_cls": "<class 'models.quantizers.ChannelSplitFSQ'>",
-        "num_codebooks": "1",
         "K": "2"
     }
 }

 {
+    "project_name": "wan8_csquant_patch2_bs8_720p",
     "teacher_config": {
+        "dim": "96",
+        "z_dim": "16",
+        "dim_mult": "[1, 2, 4, 4]",
+        "num_res_blocks": "2",
+        "attn_scales": "[]",
+        "temperal_downsample": "[False, True, True]",
+        "dropout": "0.0",
+        "cls": "<class 'models.temporal_wan.WanVAE_'>"
+    },
     "student_config": {
+        "dim": "64",
+        "z_dim": "16",
+        "dim_mult": "[1, 2, 4]",
+        "patch_size": "2",
+        "num_res_blocks": "3",
+        "attn_scales": "[]",
+        "dropout": "0.0",
+        "cls": "<class 'models.image_vae.DiscreteImageVAE'>",
+        "z_channels": "256",
+        "z_factor": "1",
+        "embedding_dim": "16",
+        "levels": "[8, 8, 8, 5, 5, 5]",
+        "dtype": "torch.float32",
+        "model_type": "wan_2_1",
+        "quantizer_cls": "<class 'models.quantizers.ChannelSplitFSQ'>",
+        "num_codebooks": "1",
         "K": "2"
     }
 }

python/simple_sample_vae.py CHANGED Viewed

@@ -8,6 +8,40 @@ from einops import rearrange, pack, unpack
 _PERSISTENT = True
 def exists(v):
     return v is not None
@@ -239,20 +273,26 @@ class RMS_norm(nn.Module):
         self.gamma = nn.Parameter(torch.ones(shape))
         self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
     def forward(self, x):
-        return (
-            F.normalize(x, dim=(1 if self.channel_first else -1))
-            * self.scale
-            * self.gamma
-            + self.bias
-        )
 class Upsample(nn.Upsample):
     def forward(self, x):
         # Fix bfloat16 support for nearest neighbor interpolation.
-        return super().forward(x.float()).type_as(x)
 class ResidualBlock2d(nn.Module):
@@ -291,21 +331,77 @@ class AttentionBlock2d(nn.Module):
         self.proj = nn.Conv2d(dim, dim, 1)
         nn.init.zeros_(self.proj.weight)
     def forward(self, x):
         identity = x
         b, c, h, w = x.size()
         x = self.norm(x)
-        q, k, v = (
-            self.to_qkv(x)
-            .reshape(b, 1, c * 3, -1)
-            .permute(0, 1, 3, 2)
-            .contiguous()
-            .chunk(3, dim=-1)
-        )
-        x = F.scaled_dot_product_attention(q, k, v)
-        x = x.squeeze(1).permute(0, 2, 1).reshape(b, c, h, w)
-        x = self.proj(x)
-        return x + identity
 class Resample2d(nn.Module):
@@ -344,6 +440,7 @@ class Encoder2d(nn.Module):
         attn_scales=[],
         patch_size=1,
         in_channels=3,
     ):
         super().__init__()
         self.dim = dim
@@ -354,6 +451,8 @@ class Encoder2d(nn.Module):
         self.patch_size = patch_size
         self.in_channels = in_channels
         # dimensions
         dims = [dim * u for u in [1] + dim_mult]
         scale = 1.0
@@ -370,7 +469,7 @@ class Encoder2d(nn.Module):
             for _ in range(num_res_blocks):
                 downsamples.append(ResidualBlock2d(in_dim, out_dim, dropout))
                 if scale in self.attn_scales:
-                    downsamples.append(AttentionBlock2d(out_dim))
                 in_dim = out_dim
             if i != len(dim_mult) - 1:
                 downsamples.append(Resample2d(out_dim, mode="downsample2d"))
@@ -386,6 +485,7 @@ class Encoder2d(nn.Module):
         )
     def forward(self, x):
         x = self.conv1(x)
         x = self.downsamples(x)
         x = self.middle(x)
@@ -404,6 +504,8 @@ class Decoder2d(nn.Module):
         dropout=0.0,
         attn_scales=[],
         out_channels=3,
     ):
         super().__init__()
         self.dim = dim
@@ -412,12 +514,15 @@ class Decoder2d(nn.Module):
         self.num_res_blocks = num_res_blocks
         self.attn_scales = attn_scales
         self.out_channels = out_channels
         # dimensions (mirror of encoder)
         base = dim * dim_mult[-1]
         dims = [base] + [dim * u for u in dim_mult[::-1]]
         scale = 1.0 / (2 ** (len(dim_mult) - 2)) if len(dim_mult) >= 2 else 1.0
-        output_channels = self.out_channels
         # init block
         self.conv1 = nn.Conv2d(z_dim, dims[0], kernel_size=3, padding=1)
@@ -432,7 +537,7 @@ class Decoder2d(nn.Module):
             for _ in range(num_res_blocks):
                 upsamples.append(ResidualBlock2d(in_dim, out_dim, dropout))
                 if scale in self.attn_scales:
-                    upsamples.append(AttentionBlock2d(out_dim))
                 in_dim = out_dim
             if i != len(dim_mult) - 1:
                 upsamples.append(Resample2d(out_dim, mode="upsample2d"))
@@ -451,6 +556,7 @@ class Decoder2d(nn.Module):
         x = self.middle(x)
         x = self.upsamples(x)
         x = self.head(x)
         return x
@@ -468,6 +574,8 @@ class DiscreteImageVAE(nn.Module):
         out_channels=3,
         embedding_dim=128,
         scale=None,
         *args,
         **kwargs,
     ):
@@ -486,6 +594,8 @@ class DiscreteImageVAE(nn.Module):
             dropout=dropout,
             attn_scales=attn_scales,
             in_channels=in_channels,
         )
         self.decoder = Decoder2d(
             dim=dim,
@@ -495,6 +605,8 @@ class DiscreteImageVAE(nn.Module):
             dropout=dropout,
             attn_scales=attn_scales,
             out_channels=out_channels,
         )
         self.embedding_dim = embedding_dim
@@ -598,7 +710,7 @@ if __name__ == "__main__":
     from PIL import Image
     import numpy as np
-    def load_image(path, size=(848, 480)):
         if not os.path.exists(path):
             print(
                 f"Image not found at {path}, generating random noise. Warning: The tokenizer might to work properly."
@@ -636,7 +748,7 @@ if __name__ == "__main__":
         "--checkpoint", type=str, default=None, help="Path to model checkpoint"
     )
     parser.add_argument(
-        "--image", type=str, default="assets/demo1.png", help="Path to input image"
     )
     parser.add_argument(
         "--output",
@@ -653,16 +765,22 @@ if __name__ == "__main__":
     args = parser.parse_args()
-    cs_discrete8_wan = {
         "dim": 64,
         "z_dim": 16,
-        "dim_mult": [1, 2, 4, 4],
-        "num_res_blocks": 2,
         "attn_scales": [],
         "dropout": 0.0,
         "embedding_dim": 16,
         "levels": [8, 8, 8, 5, 5, 5],
         "dtype": torch.float,
         "num_codebooks": 1,
         "K": 2,
     }
@@ -670,7 +788,7 @@ if __name__ == "__main__":
     device = args.device
     print(f"Running on {device}")
-    vae = DiscreteImageVAE(**cs_discrete8_wan).to(device)
     if args.checkpoint and os.path.exists(args.checkpoint):
         print(f"Loading checkpoint from {args.checkpoint}")

 _PERSISTENT = True
def patchify(x, patch_size):
    """Fold each ``patch_size`` x ``patch_size`` spatial patch into the channel axis.

    Args:
        x: Tensor of rank 4 ``(b, c, H, W)`` or rank 5 ``(b, c, f, H, W)``; the
            last two axes are the ones split into patches.
        patch_size: Side length of the square patch. ``1`` is a no-op.

    Returns:
        ``x`` itself when ``patch_size == 1``; otherwise a tensor whose channel
        axis is ``c * patch_size**2`` and whose last two axes are divided by
        ``patch_size``.

    Raises:
        ValueError: If ``x`` is neither 4-D nor 5-D (only checked when
            ``patch_size != 1``).
    """
    if patch_size == 1:
        return x

    # Pick the einops pattern by rank; the 5-D form carries an extra axis ``f``
    # that is left untouched.
    rank = x.dim()
    if rank == 4:
        pattern = "b c (h q) (w r) -> b (c r q) h w"
    elif rank == 5:
        pattern = "b c f (h q) (w r) -> b (c r q) f h w"
    else:
        raise ValueError(f"Invalid input shape: {x.shape}")
    return rearrange(x, pattern, q=patch_size, r=patch_size)
def unpatchify(x, patch_size):
    """Unfold channel-packed patches back onto the last two (spatial) axes.

    This is the inverse rearrangement of ``patchify``: the channel axis is
    split into ``(c, r, q)`` and ``q`` / ``r`` are merged back into the height
    and width axes.

    Args:
        x: Tensor of rank 4 ``(b, c * p * p, h, w)`` or rank 5
            ``(b, c * p * p, f, h, w)`` where ``p == patch_size``.
        patch_size: Side length of the square patch. ``1`` is a no-op.

    Returns:
        ``x`` itself when ``patch_size == 1``; otherwise a tensor with
        ``c`` channels and the last two axes multiplied by ``patch_size``.

    Raises:
        ValueError: If ``x`` is neither 4-D nor 5-D (only checked when
            ``patch_size != 1``). Previously such inputs were returned
            unchanged, silently skipping the unpatchify step.
    """
    if patch_size == 1:
        return x
    if x.dim() == 4:
        x = rearrange(x, "b (c r q) h w -> b c (h q) (w r)", q=patch_size, r=patch_size)
    elif x.dim() == 5:
        x = rearrange(
            x,
            "b (c r q) f h w -> b c f (h q) (w r)",
            q=patch_size,
            r=patch_size,
        )
    else:
        # Mirror patchify: an unsupported rank is an error, not a pass-through.
        raise ValueError(f"Invalid input shape: {x.shape}")
    return x
 def exists(v):
     return v is not None
         self.gamma = nn.Parameter(torch.ones(shape))
         self.bias = nn.Parameter(torch.zeros(shape)) if bias else 0.0
+    # def forward(self, x):
+    #     return (
+    #         F.normalize(x, dim=(1 if self.channel_first else -1))
+    #         * self.scale
+    #         * self.gamma
+    #         + self.bias
+    #     )
     def forward(self, x):
+        dim = 1 if self.channel_first else -1
+        rms = x.pow(2).mean(dim=dim, keepdim=True).add(1e-6).rsqrt()
+        return x * rms * self.gamma + self.bias
 class Upsample(nn.Upsample):
     def forward(self, x):
         # Delegates straight to nn.Upsample.forward; the input dtype is passed through unchanged.
+        # Earlier revision (bfloat16 nearest-neighbor workaround, now disabled): super().forward(x.float()).type_as(x)
+        return super().forward(x)
 class ResidualBlock2d(nn.Module):
         self.proj = nn.Conv2d(dim, dim, 1)
         nn.init.zeros_(self.proj.weight)
+    # def forward(self, x):
+    #     identity = x
+    #     b, c, h, w = x.size()
+    #     x = self.norm(x)
+    #     q, k, v = (
+    #         self.to_qkv(x)
+    #         .reshape(b, 1, c * 3, -1)
+    #         .permute(0, 1, 3, 2)
+    #         .contiguous()
+    #         .chunk(3, dim=-1)
+    #     )
+    #     x = F.scaled_dot_product_attention(q, k, v)
+    #     x = x.squeeze(1).permute(0, 2, 1).reshape(b, c, h, w)
+    #     x = self.proj(x)
+    #     return x + identity
     def forward(self, x):
         identity = x
         b, c, h, w = x.size()
+        n_heads = 1  # or c // 64
+        head_dim = c // n_heads
         x = self.norm(x)
+        qkv = self.to_qkv(x).reshape(b, 3, n_heads, head_dim, h * w)
+        q, k, v = qkv.unbind(1)  # Each: (b, n_heads, head_dim, h*w)
+        q, k, v = q.transpose(-1, -2), k.transpose(-1, -2), v.transpose(-1, -2)
+        x = F.scaled_dot_product_attention(q, k, v)  # Flash attention
+        x = x.transpose(-1, -2).reshape(b, c, h, w)
+        return self.proj(x) + identity
class FlashAttentionBlock2d(nn.Module):
    """Multi-head 2-D self-attention block that calls flash-attn's kernel directly.

    The block normalises the input with ``RMS_norm``, projects it to fused
    q/k/v with a 1x1 convolution, attends over all ``h * w`` positions with
    ``flash_attn_func``, applies a 1x1 output projection and adds the input
    back as a residual.

    Args:
        dim: Number of input and output channels.
        n_heads: Number of attention heads; must divide ``dim``.

    Raises:
        ValueError: If ``dim`` is not divisible by ``n_heads``.
    """

    def __init__(self, dim, n_heads=8):
        super().__init__()
        # Explicit raise rather than ``assert``: asserts vanish under ``python -O``.
        if dim % n_heads != 0:
            raise ValueError(f"dim {dim} must be divisible by n_heads {n_heads}")
        self.dim = dim
        self.n_heads = n_heads
        self.head_dim = dim // n_heads
        self.norm = RMS_norm(dim)
        self.to_qkv = nn.Conv2d(dim, dim * 3, 1)
        self.proj = nn.Conv2d(dim, dim, 1)
        # Only the output projection's weight is zeroed; its bias keeps the default init.
        nn.init.zeros_(self.proj.weight)

    def forward(self, x):
        """Attend over the flattened spatial grid and add the residual.

        Args:
            x: Tensor unpacked as ``(b, c, h, w)`` with ``c == self.dim``.
                NOTE(review): flash-attn kernels are understood to require
                CUDA half/bfloat16 inputs - confirm against the installed version.

        Returns:
            Tensor with the same shape as ``x``.
        """
        # Imported lazily so this module can be loaded without flash-attn installed.
        from flash_attn import flash_attn_func

        identity = x
        b, c, h, w = x.size()
        x = self.norm(x)
        qkv = self.to_qkv(x)  # (b, 3*c, h, w)
        # flash_attn_func expects (b, seqlen, nheads, headdim)
        qkv = qkv.reshape(b, 3, self.n_heads, self.head_dim, h * w)
        qkv = qkv.permute(0, 4, 1, 2, 3)  # (b, h*w, 3, n_heads, head_dim)
        q, k, v = qkv.unbind(2)  # each (b, h*w, n_heads, head_dim)
        x = flash_attn_func(q, k, v)  # (b, h*w, n_heads, head_dim)
        # Merge heads back into channels and restore the (b, c, h, w) layout.
        x = x.reshape(b, h * w, c).permute(0, 2, 1).reshape(b, c, h, w)
        return self.proj(x) + identity
class AsymmetricConv2d(nn.Conv2d):
    """``nn.Conv2d`` that zero-pads one extra column on the right and one row at the bottom.

    The padding is applied before the regular convolution, on top of whatever
    ``padding`` the layer was constructed with.
    """

    def forward(self, x):
        # F.pad order for the last two axes is (left, right, top, bottom).
        # Author's note: expected to be fused with the conv by torch.compile (not verified here).
        padded = F.pad(x, (0, 1, 0, 1))
        return super().forward(padded)
 class Resample2d(nn.Module):
         attn_scales=[],
         patch_size=1,
         in_channels=3,
+        attn_class=AttentionBlock2d,
     ):
         super().__init__()
         self.dim = dim
         self.patch_size = patch_size
         self.in_channels = in_channels
+        self.patcher = lambda x: patchify(x, patch_size=patch_size)
         # dimensions
         dims = [dim * u for u in [1] + dim_mult]
         scale = 1.0
             for _ in range(num_res_blocks):
                 downsamples.append(ResidualBlock2d(in_dim, out_dim, dropout))
                 if scale in self.attn_scales:
+                    downsamples.append(attn_class(out_dim))
                 in_dim = out_dim
             if i != len(dim_mult) - 1:
                 downsamples.append(Resample2d(out_dim, mode="downsample2d"))
         )
     def forward(self, x):
+        x = self.patcher(x)
         x = self.conv1(x)
         x = self.downsamples(x)
         x = self.middle(x)
         dropout=0.0,
         attn_scales=[],
         out_channels=3,
+        attn_class=AttentionBlock2d,
+        patch_size=1,
     ):
         super().__init__()
         self.dim = dim
         self.num_res_blocks = num_res_blocks
         self.attn_scales = attn_scales
         self.out_channels = out_channels
+        self.patch_size = patch_size
+        self.unpatcher = lambda x: unpatchify(x, patch_size=patch_size)
         # dimensions (mirror of encoder)
         base = dim * dim_mult[-1]
         dims = [base] + [dim * u for u in dim_mult[::-1]]
         scale = 1.0 / (2 ** (len(dim_mult) - 2)) if len(dim_mult) >= 2 else 1.0
+        output_channels = self.out_channels * self.patch_size * self.patch_size
         # init block
         self.conv1 = nn.Conv2d(z_dim, dims[0], kernel_size=3, padding=1)
             for _ in range(num_res_blocks):
                 upsamples.append(ResidualBlock2d(in_dim, out_dim, dropout))
                 if scale in self.attn_scales:
+                    upsamples.append(attn_class(out_dim))
                 in_dim = out_dim
             if i != len(dim_mult) - 1:
                 upsamples.append(Resample2d(out_dim, mode="upsample2d"))
         x = self.middle(x)
         x = self.upsamples(x)
         x = self.head(x)
+        x = self.unpatcher(x)
         return x
         out_channels=3,
         embedding_dim=128,
         scale=None,
+        attn_class=AttentionBlock2d,
+        patch_size=1,
         *args,
         **kwargs,
     ):
             dropout=dropout,
             attn_scales=attn_scales,
             in_channels=in_channels,
+            attn_class=attn_class,
+            patch_size=patch_size,
         )
         self.decoder = Decoder2d(
             dim=dim,
             dropout=dropout,
             attn_scales=attn_scales,
             out_channels=out_channels,
+            attn_class=attn_class,
+            patch_size=patch_size,
         )
         self.embedding_dim = embedding_dim
     from PIL import Image
     import numpy as np
+    def load_image(path, size=(1920, 1080)):
         if not os.path.exists(path):
             print(
                 f"Image not found at {path}, generating random noise. Warning: The tokenizer might to work properly."
         "--checkpoint", type=str, default=None, help="Path to model checkpoint"
     )
     parser.add_argument(
+        "--image", type=str, default="assets/00128.png", help="Path to input image"
     )
     parser.add_argument(
         "--output",
     args = parser.parse_args()
+    cs_discrete8_wan_patch2 = {
         "dim": 64,
         "z_dim": 16,
+        "dim_mult": [1, 2, 4],
+        "patch_size": 2,
+        "num_res_blocks": 3,
         "attn_scales": [],
         "dropout": 0.0,
+        "cls": DiscreteImageVAE,
+        "z_channels": 256,
+        "z_factor": 1,
         "embedding_dim": 16,
         "levels": [8, 8, 8, 5, 5, 5],
         "dtype": torch.float,
+        "model_type": "wan_2_1",
+        "quantizer_cls": ChannelSplitFSQ,
         "num_codebooks": 1,
         "K": 2,
     }
     device = args.device
     print(f"Running on {device}")
+    vae = DiscreteImageVAE(**cs_discrete8_wan_patch2).to(device)
     if args.checkpoint and os.path.exists(args.checkpoint):
         print(f"Loading checkpoint from {args.checkpoint}")

specs.txt CHANGED Viewed

@@ -1,13 +1,17 @@
-PSNR: 30.78 ± 3.49
-SSIM: 0.898 ± 0.063
-LPIPS: 0.123 ± 0.033
 Latent dims: [1, 2, H/8, W/8]
-[480p]
-height: 480
-width: 848
 [540p]
 height: 536
-width: 960

+PSNR: 34.61 ± 3.18
+SSIM: 0.961 ± 0.026
+LPIPS: 0.105 ± 0.026
 Latent dims: [1, 2, H/8, W/8]
 [540p]
 height: 536
+width: 960
+[720p]
+height: 720
+width: 1280
+[1080p]
+height: 1080
+width: 1920