Commit
·
161aead
1
Parent(s):
1f308ee
init
Browse files- .gitattributes +1 -0
- README.md +81 -0
- assets/robustness.png +3 -0
- autoencoder.py +134 -0
- config.json +40 -0
- diffusion_pytorch_model.safetensors +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
assets/robustness.png filter=lfs diff=lfs merge=lfs -text
|
README.md
ADDED
|
@@ -0,0 +1,81 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: apache-2.0
|
| 3 |
+
tags:
|
| 4 |
+
- NextStep
|
| 5 |
+
- Image Tokenizer
|
| 6 |
+
---
|
| 7 |
+
# Improved Image Tokenizer
|
| 8 |
+
|
| 9 |
+
This is an improved image tokenizer of NextStep-1, featuring a fine-tuned decoder with a frozen encoder. The decoder refinement **improves performance** while preserving robust reconstruction quality. We **recommend using this Image Tokenizer** for optimal results with NextStep-1 models.
|
| 10 |
+
|
| 11 |
+
## Usage
|
| 12 |
+
|
| 13 |
+
```py
|
| 14 |
+
import torch
|
| 15 |
+
from PIL import Image
|
| 16 |
+
import numpy as np
|
| 17 |
+
import torchvision.transforms as transforms
|
| 18 |
+
|
| 19 |
+
from autoencoder import AutoencoderKLNextStep
|
| 20 |
+
|
| 21 |
+
device = "cuda"
|
| 22 |
+
dtype = torch.bfloat16
|
| 23 |
+
|
| 24 |
+
model_path = "/path/to/vae_dir"
|
| 25 |
+
vae = AutoencoderKLNextStep.from_pretrained(model_path).to(device=device, dtype=dtype)
|
| 26 |
+
|
| 27 |
+
pil2tensor = transforms.Compose(
|
| 28 |
+
[
|
| 29 |
+
transforms.ToTensor(),
|
| 30 |
+
transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5]),
|
| 31 |
+
]
|
| 32 |
+
)
|
| 33 |
+
|
| 34 |
+
image = Image.open("/path/to/image.jpg").convert("RGB")
|
| 35 |
+
pixel_values = pil2tensor(image).unsqueeze(0).to(device=device, dtype=dtype)
|
| 36 |
+
|
| 37 |
+
# encode
|
| 38 |
+
latents = vae.encode(pixel_values).latent_dist.sample()
|
| 39 |
+
|
| 40 |
+
# decode
|
| 41 |
+
sampled_images = vae.decode(latents).sample
|
| 42 |
+
sampled_images = sampled_images.detach().cpu().to(torch.float32)
|
| 43 |
+
|
| 44 |
+
def tensor_to_pil(tensor):
|
| 45 |
+
image = tensor.detach().cpu().to(torch.float32)
|
| 46 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
| 47 |
+
image = image.mul(255).round().to(dtype=torch.uint8)
|
| 48 |
+
image = image.permute(1, 2, 0).numpy()
|
| 49 |
+
return Image.fromarray(image, mode="RGB")
|
| 50 |
+
|
| 51 |
+
rec_image = tensor_to_pil(sampled_images[0])
|
| 52 |
+
rec_image.save("/path/to/output.jpg")
|
| 53 |
+
```
|
| 54 |
+
|
| 55 |
+
## Evaluation
|
| 56 |
+
|
| 57 |
+
### Reconstruction Performance on ImageNet-1K 256×256
|
| 58 |
+
|
| 59 |
+
| Tokenizer | Latent Shape | PSNR ↑ | SSIM ↑ |
|
| 60 |
+
| ------------------------- | ------------ | --------- | -------- |
|
| 61 |
+
| **Discrete Tokenizers** | | | |
|
| 62 |
+
| SBER-MoVQGAN (270M) | 32×32 | 27.04 | 0.74 |
|
| 63 |
+
| LlamaGen | 32×32 | 24.44 | 0.77 |
|
| 64 |
+
| VAR | 680 | 22.12 | 0.62 |
|
| 65 |
+
| TiTok-S-128 | 128 | 17.52 | 0.44 |
|
| 66 |
+
| Selftok                   | 1024         | 26.30     | 0.81     |
|
| 67 |
+
| **Continuous Tokenizers** | | | |
|
| 68 |
+
| Stable Diffusion 1.5 | 32×32×4 | 25.18 | 0.73 |
|
| 69 |
+
| Stable Diffusion XL | 32×32×4 | 26.22 | 0.77 |
|
| 70 |
+
| Stable Diffusion 3 Medium | 32×32×16 | 30.00 | 0.88 |
|
| 71 |
+
| Flux.1-dev | 32×32×16 | 31.64 | 0.91 |
|
| 72 |
+
| **NextStep-1** | **32×32×16** | **30.60** | **0.89** |
|
| 73 |
+
|
| 74 |
+
### Robustness of NextStep-1-f8ch16-Tokenizer
|
| 75 |
+
|
| 76 |
+
Impact of Noise Perturbation on Image Tokenizer Performance. The top panel displays
|
| 77 |
+
quantitative metrics (rFID↓, PSNR↑, and SSIM↑) versus noise intensity. The bottom panel presents qualitative reconstruction examples at noise standard deviations of 0.2 and 0.5.
|
| 78 |
+
|
| 79 |
+
<div align='center'>
|
| 80 |
+
<img src="assets/robustness.png" class="interpolation-image" alt="arch." width="100%" />
|
| 81 |
+
</div>
|
assets/robustness.png
ADDED
|
Git LFS Details
|
autoencoder.py
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import Dict, Optional, Tuple, Union
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
import torch.nn as nn
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
from diffusers import AutoencoderKL
|
| 7 |
+
from diffusers.configuration_utils import register_to_config
|
| 8 |
+
from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDistribution
|
| 9 |
+
from diffusers.models.modeling_outputs import AutoencoderKLOutput
|
| 10 |
+
from diffusers.utils.accelerate_utils import apply_forward_hook
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
class AutoencoderKLNextStep(AutoencoderKL):
    """AutoencoderKL variant used by the NextStep-1 image tokenizer.

    Extends ``diffusers.AutoencoderKL`` with three extra config options:

    * ``deterministic`` -- build the latent posterior with zero variance, so
      ``sample()`` and ``mode()`` both return the mean.
    * ``normalize_latents`` -- apply a non-affine layer norm over the channel
      dimension of the latent mean before building the posterior.
    * ``patch_size`` -- space-to-depth the latent mean into ``p x p`` patches
      before the normalization (and undo it afterwards), so the layer norm is
      taken over ``channels * p**2`` values per spatial position.
    """

    @register_to_config
    def __init__(
        self,
        in_channels: int = 3,
        out_channels: int = 3,
        down_block_types: Tuple[str] = ("DownEncoderBlock2D",),
        up_block_types: Tuple[str] = ("UpDecoderBlock2D",),
        block_out_channels: Tuple[int] = (64,),
        layers_per_block: int = 1,
        act_fn: str = "silu",
        latent_channels: int = 4,
        norm_num_groups: int = 32,
        sample_size: int = 32,
        scaling_factor: float = 0.18215,
        shift_factor: Optional[float] = None,
        latents_mean: Optional[Tuple[float]] = None,
        latents_std: Optional[Tuple[float]] = None,
        force_upcast: bool = True,
        use_quant_conv: bool = True,
        use_post_quant_conv: bool = True,
        mid_block_add_attention: bool = True,
        deterministic: bool = False,
        normalize_latents: bool = False,
        patch_size: Optional[int] = None,
    ):
        super().__init__(
            in_channels=in_channels,
            out_channels=out_channels,
            down_block_types=down_block_types,
            up_block_types=up_block_types,
            block_out_channels=block_out_channels,
            layers_per_block=layers_per_block,
            act_fn=act_fn,
            latent_channels=latent_channels,
            norm_num_groups=norm_num_groups,
            sample_size=sample_size,
            scaling_factor=scaling_factor,
            shift_factor=shift_factor,
            latents_mean=latents_mean,
            latents_std=latents_std,
            force_upcast=force_upcast,
            use_quant_conv=use_quant_conv,
            use_post_quant_conv=use_post_quant_conv,
            mid_block_add_attention=mid_block_add_attention,
        )
        # NextStep-specific options (also persisted in the model config by
        # @register_to_config).
        self.deterministic = deterministic
        self.normalize_latents = normalize_latents
        self.patch_size = patch_size

    def patchify(self, x: torch.Tensor) -> torch.Tensor:
        """Space-to-depth: (B, C, H, W) -> (B, C*p*p, H//p, W//p).

        Assumes H and W are divisible by ``self.patch_size``.
        """
        b, c, h, w = x.shape
        p = self.patch_size
        h_, w_ = h // p, w // p

        x = x.reshape(b, c, h_, p, w_, p)
        x = torch.einsum("bchpwq->bcpqhw", x)
        x = x.reshape(b, c * p**2, h_, w_)
        return x

    def unpatchify(self, x: torch.Tensor) -> torch.Tensor:
        """Inverse of :meth:`patchify`: (B, C*p*p, H', W') -> (B, C, H'*p, W'*p)."""
        b, _, h_, w_ = x.shape
        p = self.patch_size
        c = x.shape[1] // (p**2)

        x = x.reshape(b, c, p, p, h_, w_)
        x = torch.einsum("bcpqhw->bchpwq", x)
        x = x.reshape(b, c, h_ * p, w_ * p)
        return x

    @apply_forward_hook
    def encode(
        self, x: torch.Tensor, return_dict: bool = True
    ) -> Union[AutoencoderKLOutput, Tuple[DiagonalGaussianDistribution]]:
        """Encode pixels into a (possibly normalized) latent posterior.

        Args:
            x: Pixel tensor of shape (B, C, H, W).
            return_dict: If False, return a 1-tuple of the posterior instead
                of an ``AutoencoderKLOutput``.
        """
        if self.use_slicing and x.shape[0] > 1:
            # Slicing mode: encode one sample at a time to bound peak memory.
            encoded_slices = [self._encode(x_slice) for x_slice in x.split(1)]
            h = torch.cat(encoded_slices)
        else:
            h = self._encode(x)

        mean, logvar = torch.chunk(h, 2, dim=1)
        if self.normalize_latents:
            # Layer-norm the mean over the channel axis, per spatial position.
            # With a patch size set, the norm spans channels * p**2 values via
            # a patchify/unpatchify round-trip. (The original code ran that
            # round-trip even when normalize_latents was False; it is an
            # identity, so guarding on normalize_latents preserves behavior
            # while skipping the wasted reshapes.)
            if self.patch_size is not None:
                mean = self.patchify(mean)
            mean = mean.permute(0, 2, 3, 1)
            mean = F.layer_norm(mean, mean.shape[-1:], eps=1e-6)
            mean = mean.permute(0, 3, 1, 2)
            if self.patch_size is not None:
                mean = self.unpatchify(mean)
        h = torch.cat([mean, logvar], dim=1).contiguous()
        posterior = DiagonalGaussianDistribution(h, deterministic=self.deterministic)

        if not return_dict:
            return (posterior,)

        return AutoencoderKLOutput(latent_dist=posterior)

    def forward(
        self,
        sample: torch.Tensor,
        sample_posterior: bool = False,
        return_dict: bool = True,
        generator: Optional[torch.Generator] = None,
        noise_strength: float = 0.0,
    ) -> Union[DecoderOutput, torch.Tensor]:
        """Encode then decode ``sample``, optionally perturbing the latent.

        Args:
            sample: Input pixel tensor of shape (B, C, H, W).
            sample_posterior: Sample from the posterior instead of taking its
                mode.
            return_dict: If False, return a 1-tuple of the reconstruction.
            generator: RNG used when sampling the posterior.
            noise_strength: If > 0, add Gaussian noise to the latent with a
                per-sample std drawn uniformly from [0, noise_strength)
                (used for the robustness evaluation in the README).
        """
        x = sample
        posterior = self.encode(x).latent_dist
        if sample_posterior:
            z = posterior.sample(generator=generator)
        else:
            z = posterior.mode()
        if noise_strength > 0.0:
            # Per-sample noise scale, broadcast over (C, H, W); cast to z's
            # dtype so the addition does not silently promote the latent.
            p = torch.distributions.Uniform(0, noise_strength)
            scale = p.sample((z.shape[0],)).reshape(-1, 1, 1, 1).to(device=z.device, dtype=z.dtype)
            # BUG FIX: the original called `randn_tensor`, which is never
            # imported in this file and raised NameError whenever
            # noise_strength > 0. torch.randn is equivalent here.
            z = z + scale * torch.randn(z.shape, device=z.device, dtype=z.dtype)
        dec = self.decode(z).sample

        if not return_dict:
            return (dec,)

        return DecoderOutput(sample=dec)
|
config.json
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_class_name": "AutoencoderKLNextStep",
|
| 3 |
+
"_diffusers_version": "0.35.0.dev0",
|
| 4 |
+
"act_fn": "silu",
|
| 5 |
+
"block_out_channels": [
|
| 6 |
+
128,
|
| 7 |
+
256,
|
| 8 |
+
512,
|
| 9 |
+
512
|
| 10 |
+
],
|
| 11 |
+
"down_block_types": [
|
| 12 |
+
"DownEncoderBlock2D",
|
| 13 |
+
"DownEncoderBlock2D",
|
| 14 |
+
"DownEncoderBlock2D",
|
| 15 |
+
"DownEncoderBlock2D"
|
| 16 |
+
],
|
| 17 |
+
"force_upcast": true,
|
| 18 |
+
"in_channels": 3,
|
| 19 |
+
"latent_channels": 16,
|
| 20 |
+
"latents_mean": null,
|
| 21 |
+
"latents_std": null,
|
| 22 |
+
"layers_per_block": 2,
|
| 23 |
+
"mid_block_add_attention": true,
|
| 24 |
+
"norm_num_groups": 32,
|
| 25 |
+
"out_channels": 3,
|
| 26 |
+
"sample_size": 512,
|
| 27 |
+
"scaling_factor": 1,
|
| 28 |
+
"shift_factor": 0,
|
| 29 |
+
"up_block_types": [
|
| 30 |
+
"UpDecoderBlock2D",
|
| 31 |
+
"UpDecoderBlock2D",
|
| 32 |
+
"UpDecoderBlock2D",
|
| 33 |
+
"UpDecoderBlock2D"
|
| 34 |
+
],
|
| 35 |
+
"use_post_quant_conv": false,
|
| 36 |
+
"use_quant_conv": false,
|
| 37 |
+
"deterministic": true,
|
| 38 |
+
"normalize_latents": true,
|
| 39 |
+
"patch_size": 2
|
| 40 |
+
}
|
diffusion_pytorch_model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d67ef6afe4ec377d53e99b270cf9a5f346f4c21dfe00732e2043b5b4c42ba394
|
| 3 |
+
size 335306212
|