Spaces:

bmarci
/

NextStep-1-Large

Running on Zero

App Files Files Community

bmarci committed on Aug 23

Commit

b81a188

1 Parent(s): 996db64

fix

Browse files

Files changed (3) hide show

app.py +31 -30
vae/config.json +1 -2
vae/nextstep_ae.py +39 -72

app.py CHANGED Viewed

@@ -3,9 +3,8 @@ import numpy as np
 import random
 import spaces
 from PIL import Image
-# import spaces #[uncomment to use ZeroGPU]
 import torch
 from transformers import AutoTokenizer, AutoModel
 from models.gen_pipeline import NextStepPipeline
@@ -15,8 +14,15 @@ HF_HUB = "stepfun-ai/NextStep-1-Large"
 device = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained(HF_HUB, local_files_only=False, trust_remote_code=True)
-model = AutoModel.from_pretrained(HF_HUB, local_files_only=False, trust_remote_code=True)
-pipeline = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device)
 MAX_SEED = np.iinfo(np.int16).max
 MAX_IMAGE_SIZE = 512
@@ -30,8 +36,6 @@ def infer(
     seed=0,
     width=512,
     height=512,
-    #text_cfg=7.5,
-    #img_cfg=1.0,
     num_inference_steps=28,
     positive_prompt=DEFAULT_POSITIVE_PROMPT,
     negative_prompt=DEFAULT_NEGATIVE_PROMPT,
@@ -40,21 +44,23 @@ def infer(
     if prompt in [None, ""]:
         gr.Warning("⚠️ Please enter a prompt!")
         return None
-    image = pipeline.generate_image(
-        editing_caption,
-        hw=(height, width),
-        num_images_per_caption=1,
-        positive_prompt=positive_prompt,
-        negative_prompt=negative_prompt,
-        cfg=7.5,
-        cfg_img=1.0,
-        cfg_schedule="constant",
-        use_norm=False,
-        num_sampling_steps=num_inference_steps,
-        timesteps_shift=1.0,
-        seed=seed,
-        progress=True,
-    )
     return image[0]
@@ -114,7 +120,7 @@ with gr.Blocks(css=css) as demo:
                         step=1,
                         value=28,
                     )
                 with gr.Row():
                     width = gr.Slider(
                         label="Width",
@@ -132,9 +138,7 @@ with gr.Blocks(css=css) as demo:
                     )
         with gr.Row():
-            result_1 = gr.Image(label="Result 1", show_label=False, container=True, height=400, visible=False)
-        #gr.Examples(examples=examples, inputs=[prompt, ref])
     def show_result():
         return gr.update(visible=True)
@@ -147,15 +151,13 @@ with gr.Blocks(css=css) as demo:
             seed,
             width,
             height,
-            #text_cfg,
-            #img_cfg,
             num_inference_steps,
             positive_prompt,
             negative_prompt,
         ],
         outputs=[result_1],
     )
     cancel_button.click(
         fn=None,
         inputs=None,
@@ -169,6 +171,5 @@ with gr.Blocks(css=css) as demo:
         outputs=[result_1],
     )
 if __name__ == "__main__":
-    demo.launch()

 import random
 import spaces
 from PIL import Image
 import torch
+from torch.amp import autocast
 from transformers import AutoTokenizer, AutoModel
 from models.gen_pipeline import NextStepPipeline
 device = "cuda" if torch.cuda.is_available() else "cpu"
 tokenizer = AutoTokenizer.from_pretrained(HF_HUB, local_files_only=False, trust_remote_code=True)
+model = AutoModel.from_pretrained(
+    HF_HUB,
+    local_files_only=False,
+    trust_remote_code=True,
+    torch_dtype=torch.bfloat16,
+).to(device)
+pipeline = NextStepPipeline(tokenizer=tokenizer, model=model).to(device=device, dtype=torch.bfloat16)
 MAX_SEED = np.iinfo(np.int16).max
 MAX_IMAGE_SIZE = 512
     seed=0,
     width=512,
     height=512,
     num_inference_steps=28,
     positive_prompt=DEFAULT_POSITIVE_PROMPT,
     negative_prompt=DEFAULT_NEGATIVE_PROMPT,
     if prompt in [None, ""]:
         gr.Warning("⚠️ Please enter a prompt!")
         return None
+    with autocast(device_type=("cuda" if device == "cuda" else "cpu"), dtype=torch.bfloat16):
+        image = pipeline.generate_image(
+            prompt,
+            hw=(height, width),
+            num_images_per_caption=1,
+            positive_prompt=positive_prompt,
+            negative_prompt=negative_prompt,
+            cfg=7.5,
+            cfg_img=1.0,
+            cfg_schedule="constant",
+            use_norm=False,
+            num_sampling_steps=num_inference_steps,
+            timesteps_shift=1.0,
+            seed=seed,
+            progress=True,
+        )
     return image[0]
                         step=1,
                         value=28,
                     )
                 with gr.Row():
                     width = gr.Slider(
                         label="Width",
                     )
         with gr.Row():
+            result_1 = gr.Image(label="Result 1", show_label=False, container=True, height=MAX_IMAGE_SIZE, visible=False)
     def show_result():
         return gr.update(visible=True)
             seed,
             width,
             height,
             num_inference_steps,
             positive_prompt,
             negative_prompt,
         ],
         outputs=[result_1],
     )
     cancel_button.click(
         fn=None,
         inputs=None,
         outputs=[result_1],
     )
 if __name__ == "__main__":
+    demo.launch()

vae/config.json CHANGED Viewed

@@ -9,7 +9,6 @@
     "shift_factor": 0,
     "scaling_factor": 1,
     "deterministic": true,
-    "norm_fn": "layer_norm",
-    "norm_level": "channel",
     "psz": 1
 }

     "shift_factor": 0,
     "scaling_factor": 1,
     "deterministic": true,
+    "encoder_norm": true,
     "psz": 1
 }

vae/nextstep_ae.py CHANGED Viewed

@@ -2,7 +2,6 @@ import os
 import json
 import inspect
 from dataclasses import dataclass, field, asdict
-from typing import Literal
 from loguru import logger
 from omegaconf import OmegaConf
 from tabulate import tabulate
@@ -10,6 +9,7 @@ from einops import rearrange
 import torch
 import torch.nn as nn
 from torch import Tensor
 from torch.utils.checkpoint import checkpoint
@@ -17,7 +17,7 @@ from diffusers.models.autoencoders.vae import DecoderOutput, DiagonalGaussianDis
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from utils.misc import LargeInt
-from utils.model_utils import identity, rms_norm, layer_norm, randn_tensor, expand_t
 from utils.compile_utils import smart_compile
@@ -33,8 +33,7 @@ class AutoEncoderParams:
     scaling_factor: float = 0.3611
     shift_factor: float = 0.1159
     deterministic: bool = False
-    norm_fn: Literal["layer_norm", "rms_norm"] | None = None
-    norm_level: Literal["latent", "channel"] = "latent"
     psz: int | None = None
@@ -306,6 +305,14 @@ class Decoder(nn.Module):
         return h
 class AutoencoderKL(nn.Module):
     def __init__(self, params: AutoEncoderParams):
         super().__init__()
@@ -333,19 +340,8 @@ class AutoencoderKL(nn.Module):
             z_channels=params.z_channels,
         )
         self.psz = params.psz
-        # if self.psz is not None:
-        #     logger.warning("psz has been deprecated, this is only used for hack's vae")
-        if params.norm_fn is None:
-            self.norm_fn = identity
-        elif params.norm_fn == "layer_norm":
-            self.norm_fn = layer_norm
-        elif params.norm_fn == "rms_norm":
-            self.norm_fn = rms_norm
-        else:
-            raise ValueError(f"Invalid norm_fn: {params.norm_fn}")
-        self.norm_level = params.norm_level
         self.apply(self._init_weights)
@@ -420,18 +416,17 @@ class AutoencoderKL(nn.Module):
     def encode(self, x: torch.Tensor, return_dict: bool = True):
         moments = self.encoder(x)
-        if self.norm_fn is not None:
-            mean, logvar = torch.chunk(moments, 2, dim=1)
-            if self.psz is not None:  # HACK
-                mean = self.patchify(mean)
-            if self.norm_level == "latent":
-                mean = self.norm_fn(mean, mean.size()[1:])
-            elif self.norm_level == "channel":
-                mean = mean.permute(0, 2, 3, 1)  # [bsz, c, h, w] --> [bsz, h, w, c]
-                mean = self.norm_fn(mean, mean.size()[-1:]).permute(0, 3, 1, 2)  # [bsz, h, w, c] --> [bsz, c, h, w]
-            if self.psz is not None:  # HACK
-                mean = self.unpatchify(mean)
-            moments = torch.cat([mean, logvar], dim=1).contiguous()
         posterior = DiagonalGaussianDistribution(moments, deterministic=self.params.deterministic)
@@ -448,14 +443,7 @@ class AutoencoderKL(nn.Module):
         return DecoderOutput(sample=dec)
-    def forward(
-        self,
-        input,
-        sample_posterior=True,
-        noise_strength=0.0,
-        interpolative_noise=False,
-        t_dist: Literal["uniform", "logitnormal"] = "logitnormal",
-    ):
         posterior = self.encode(input).latent_dist
         z = posterior.sample() if sample_posterior else posterior.mode()
         if noise_strength > 0.0:
@@ -463,46 +451,25 @@ class AutoencoderKL(nn.Module):
             z = z + p.sample((z.shape[0],)).reshape(-1, 1, 1, 1).to(z.device) * randn_tensor(
                 z.shape, device=z.device, dtype=z.dtype
             )
-        if interpolative_noise:
-            z = self.patchify(z)
-            bsz, c, h, w = z.shape
-            z = z.permute(0, 2, 3, 1)  # [bsz, h, w, c]
-            z = z.reshape(-1, c)  # [bsz * h * w, c]
-            if t_dist == "logitnormal":
-                u = torch.normal(mean=0.0, std=1.0, size=(z.shape[0],))
-                t = (1 / (1 + torch.exp(-u))).to(z)
-            elif t_dist == "uniform":
-                t = torch.randn((z.shape[0],)).to(z)
-            else:
-                raise ValueError(f"Invalid t_dist: {t_dist}")
-            noise = torch.randn_like(z)
-            z = expand_t(t, z) * z + (1 - expand_t(t, z)) * noise
-            z = z.reshape(bsz, h, w, c).permute(0, 3, 1, 2)
-            z = self.unpatchify(z)
         dec = self.decode(z).sample
         return dec, posterior
     @classmethod
-    def from_pretrained(cls, pretrained_model_name_or_path: str = "flux-vae", **kwargs):
-        config_path = None
-        ckpt_path = pretrained_model_name_or_path
-        if ckpt_path is not None and os.path.isdir(ckpt_path):
-            config_path = os.path.join(ckpt_path, "config.json")
-            ckpt_path = os.path.join(ckpt_path, "checkpoint.pt")
-        state_dict = torch.load(ckpt_path, map_location="cpu") if ckpt_path is not None else None
-        if kwargs is None:
-            kwargs = {}
-        if config_path is not None:
-            with open(config_path, "r") as f:
-                config: dict = json.load(f)
-            config.update(kwargs)
-            kwargs = config
         # Filter out kwargs that are not in AutoEncoderParams
         # This ensures we only pass parameters that the model can accept

 import json
 import inspect
 from dataclasses import dataclass, field, asdict
 from loguru import logger
 from omegaconf import OmegaConf
 from tabulate import tabulate
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from torch import Tensor
 from torch.utils.checkpoint import checkpoint
 from diffusers.models.modeling_outputs import AutoencoderKLOutput
 from utils.misc import LargeInt
+from utils.model_utils import randn_tensor
 from utils.compile_utils import smart_compile
     scaling_factor: float = 0.3611
     shift_factor: float = 0.1159
     deterministic: bool = False
+    encoder_norm: bool = False
     psz: int | None = None
         return h
+def layer_norm_2d(input: torch.Tensor, normalized_shape: torch.Size, eps: float = 1e-6) -> torch.Tensor:
+    # input.shape = (bsz, c, h, w)
+    _input = input.permute(0, 2, 3, 1)
+    _input = F.layer_norm(_input, normalized_shape, None, None, eps)
+    _input = _input.permute(0, 3, 1, 2)
+    return _input
 class AutoencoderKL(nn.Module):
     def __init__(self, params: AutoEncoderParams):
         super().__init__()
             z_channels=params.z_channels,
         )
+        self.encoder_norm = params.encoder_norm
         self.psz = params.psz
         self.apply(self._init_weights)
     def encode(self, x: torch.Tensor, return_dict: bool = True):
         moments = self.encoder(x)
+        mean, logvar = torch.chunk(moments, 2, dim=1)
+        if self.psz is not None:
+            mean = self.patchify(mean)
+        if self.encoder_norm:
+            mean = layer_norm_2d(mean, mean.size()[-1:])
+        if self.psz is not None:
+            mean = self.unpatchify(mean)
+        moments = torch.cat([mean, logvar], dim=1).contiguous()
         posterior = DiagonalGaussianDistribution(moments, deterministic=self.params.deterministic)
         return DecoderOutput(sample=dec)
+    def forward(self, input, sample_posterior=True, noise_strength=0.0):
         posterior = self.encode(input).latent_dist
         z = posterior.sample() if sample_posterior else posterior.mode()
         if noise_strength > 0.0:
             z = z + p.sample((z.shape[0],)).reshape(-1, 1, 1, 1).to(z.device) * randn_tensor(
                 z.shape, device=z.device, dtype=z.dtype
             )
         dec = self.decode(z).sample
         return dec, posterior
     @classmethod
+    def from_pretrained(cls, model_path, **kwargs):
+        config_path = os.path.join(model_path, "config.json")
+        ckpt_path = os.path.join(model_path, "checkpoint.pt")
+        if not os.path.isdir(model_path) or not os.path.isfile(config_path) or not os.path.isfile(ckpt_path):
+            raise ValueError(
+                f"Invalid model path: {model_path}. The path should contain both config.json and checkpoint.pt files."
+            )
+        state_dict = torch.load(ckpt_path, map_location="cpu", weights_only=True)
+        with open(config_path, "r") as f:
+            config: dict = json.load(f)
+        config.update(kwargs)
+        kwargs = config
         # Filter out kwargs that are not in AutoEncoderParams
         # This ensures we only pass parameters that the model can accept