Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
012e1d0
1
Parent(s):
680053b
add inference
Browse files- NoiseTransformer.py +26 -0
- SVDNoiseUnet.py +430 -0
- app.py +330 -0
- dpm_solver_v3.py +904 -0
- free_lunch_utils.py +303 -0
- requirements.txt +11 -0
- sampler.py +315 -0
- uni_pc.py +757 -0
NoiseTransformer.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch.nn as nn
|
| 2 |
+
|
| 3 |
+
from torch.nn import functional as F
|
| 4 |
+
from timm import create_model
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
__all__ = ['NoiseTransformer']
|
| 8 |
+
|
| 9 |
+
class NoiseTransformer(nn.Module):
    """Swin-Transformer-based noise refiner.

    Maps a 4-channel latent to a refined 4-channel latent: resize up to the
    Swin input size (224x224), run a pretrained Swin-Tiny backbone, resize
    back to the latent resolution, and project channels with 1x1 convs.
    """

    def __init__(self, resolution=(128,96)):
        super().__init__()
        # Resize helpers: Swin expects 224x224 inputs; `downsample` restores
        # the latent spatial resolution afterwards.
        self.upsample = lambda x: F.interpolate(x, [224,224])
        self.downsample = lambda x: F.interpolate(x, [resolution[0],resolution[1]])
        # 1x1 channel projections.
        # NOTE(review): `upconv` expects 7 input channels — presumably the
        # channel layout of `swin.forward_features` output after resizing;
        # confirm against the installed timm version's feature format.
        self.upconv = nn.Conv2d(7,4,(1,1),(1,1),(0,0))
        self.downconv = nn.Conv2d(4,3,(1,1),(1,1),(0,0))
        # self.upconv = nn.Conv2d(7,4,(1,1),(1,1),(0,0))
        self.swin = create_model("swin_tiny_patch4_window7_224",pretrained=True)

    def forward(self, x, residual=False):
        # Pipeline: 4ch latent -> 3ch 224x224 -> Swin features -> latent-size
        # feature map -> 4ch latent. `residual=True` adds the input back as a
        # skip connection.
        if residual:
            x = self.upconv(self.downsample(self.swin.forward_features(self.downconv(self.upsample(x))))) + x
        else:
            x = self.upconv(self.downsample(self.swin.forward_features(self.downconv(self.upsample(x)))))

        return x
|
SVDNoiseUnet.py
ADDED
|
@@ -0,0 +1,430 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import einops
|
| 4 |
+
|
| 5 |
+
from torch.nn import functional as F
|
| 6 |
+
from torch.jit import Final
|
| 7 |
+
from timm.layers import use_fused_attn
|
| 8 |
+
from timm.models.layers import PatchEmbed, Mlp, DropPath, trunc_normal_, lecun_normal_, get_act_layer
|
| 9 |
+
from abc import abstractmethod
|
| 10 |
+
from NoiseTransformer import NoiseTransformer
|
| 11 |
+
from einops import rearrange
|
| 12 |
+
__all__ = ['SVDNoiseUnet', 'SVDNoiseUnet_Concise']
|
| 13 |
+
|
| 14 |
+
class Attention(nn.Module):
    """Multi-head self-attention (adapted from timm's ViT Attention).

    Uses torch's fused scaled_dot_product_attention when timm reports it is
    available; otherwise falls back to explicit softmax attention.
    """
    fused_attn: Final[bool]

    def __init__(
            self,
            dim: int,
            num_heads: int = 8,
            qkv_bias: bool = False,
            qk_norm: bool = False,
            attn_drop: float = 0.,
            proj_drop: float = 0.,
            norm_layer: nn.Module = nn.LayerNorm,
    ) -> None:
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        self.head_dim = dim // num_heads
        self.scale = self.head_dim ** -0.5
        self.fused_attn = use_fused_attn()

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        # Optional per-head LayerNorm on queries/keys (QK-norm).
        self.q_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.k_norm = norm_layer(self.head_dim) if qk_norm else nn.Identity()
        self.attn_drop = nn.Dropout(attn_drop)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, N, C = x.shape
        # (B, N, 3*C) -> (3, B, num_heads, N, head_dim)
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, self.head_dim).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(0)
        q, k = self.q_norm(q), self.k_norm(k)

        if self.fused_attn:
            # Fused kernel applies scaling and dropout internally.
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop.p if self.training else 0.,
            )
        else:
            q = q * self.scale
            attn = q @ k.transpose(-2, -1)
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = attn @ v

        # Merge heads back: (B, heads, N, head_dim) -> (B, N, C).
        x = x.transpose(1, 2).reshape(B, N, C)
        x = self.proj(x)
        x = self.proj_drop(x)
        return x
|
| 63 |
+
|
| 64 |
+
|
| 65 |
+
class SVDNoiseUnet(nn.Module):
    """Refines noise in the SVD domain of a folded latent.

    The 4-channel latent is folded into one 2D matrix per sample, decomposed
    with SVD, and small MLP/attention heads predict a correction to the
    singular values before the matrix is re-assembled and unfolded.
    """

    def __init__(self, in_channels=4, out_channels=4, resolution=(128,96)): # resolution = size // 8
        super(SVDNoiseUnet, self).__init__()

        # The folded matrix is (2*resolution[0]) x (2*resolution[1]); these
        # are the per-axis feature widths seen by the MLPs below.
        _in_1 = int(resolution[0] * in_channels // 2)
        _out_1 = int(resolution[0] * out_channels // 2)

        _in_2 = int(resolution[1] * in_channels // 2)
        _out_2 = int(resolution[1] * out_channels // 2)
        self.mlp1 = nn.Sequential(
            nn.Linear(_in_1, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out_1),
        )
        self.mlp2 = nn.Sequential(
            nn.Linear(_in_2, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out_2),
        )

        self.mlp3 = nn.Sequential(
            nn.Linear(_in_2, _out_2),
        )

        self.attention = Attention(_out_2)

        # NOTE(review): hard-coded 256/192 match resolution=(128,96) only;
        # other resolutions would need different BatchNorm/FFN sizes.
        self.bn = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(192)

        self.mlp4 = nn.Sequential(
            nn.Linear(_out_2, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, _out_2),
        )
        self.ffn = nn.Sequential(
            nn.Linear(256, 384), # Expand
            nn.ReLU(inplace=True),
            nn.Linear(384, 192) # Reduce to target size
        )
        self.ffn2 = nn.Sequential(
            nn.Linear(256, 384), # Expand
            nn.ReLU(inplace=True),
            nn.Linear(384, 192) # Reduce to target size
        )
        # self.adaptive_pool = nn.AdaptiveAvgPool2d((256, 192))

    def forward(self, x, residual=False):
        b, c, h, w = x.shape
        # Fold (b, 4, h, w) into one (b, 2h, 2w) matrix per sample by tiling
        # the 4 channels into a 2x2 spatial grid.
        # NOTE(review): original shape comments below assume a square 256x256
        # matrix; with resolution=(128,96) it is actually (b, 256, 192).
        x = einops.rearrange(x, "b (a c)h w ->b (a h)(c w)", a=2,c=2) # x -> [1, 256, 256]
        # NOTE(review): torch.linalg.svd returns (U, S, Vh); `V` here is V^H,
        # which is consistent with the `U @ diag @ V` reconstruction below.
        U, s, V = torch.linalg.svd(x) # U->[b 256 256], s-> [b 256], V->[b 256 256]
        U_T = U.permute(0, 2, 1)
        U_out = self.ffn(self.mlp1(U_T))
        U_out = self.bn(U_out)
        U_out = U_out.transpose(1, 2)
        U_out = self.ffn2(U_out) # [b, 256, 256] -> [b, 256, 192]
        U_out = self.bn2(U_out)
        U_out = U_out.transpose(1, 2)
        # U_out = self.bn(U_out)
        V_out = self.mlp2(V)
        s_out = self.mlp3(s).unsqueeze(1) # s -> [b, 1, 256] => [b, 256, 256]
        out = U_out + V_out + s_out
        # print(out.size())
        out = out.squeeze(1)
        # Pool the attention output over the sequence axis to get one
        # correction vector of singular values per sample.
        out = self.attention(out).mean(1)
        out = self.mlp4(out) + s
        # Rebuild a tall rectangular diagonal so U @ Sigma @ Vh shapes align.
        diagonal_out = torch.diag_embed(out)
        padded_diag = F.pad(diagonal_out, (0, 0, 0, 64), mode='constant', value=0) # Shape: [b, 1, 256, 192]
        pred = U @ padded_diag @ V
        # Unfold back to (b, 4, h, w).
        return einops.rearrange(pred, "b (a h)(c w) -> b (a c) h w", a=2,c=2)
|
| 134 |
+
|
| 135 |
+
class SVDNoiseUnet64(nn.Module):
    """Square-latent (64x64) variant of SVDNoiseUnet.

    Same SVD-domain scheme as SVDNoiseUnet but for square latents, so no
    padding of the singular-value diagonal is needed.
    """

    def __init__(self, in_channels=4, out_channels=4, resolution=64): # resolution = size // 8
        super(SVDNoiseUnet64, self).__init__()

        # The folded matrix is (2*resolution) x (2*resolution).
        _in = int(resolution * in_channels // 2)
        _out = int(resolution * out_channels // 2)
        self.mlp1 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )
        self.mlp2 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )

        self.mlp3 = nn.Sequential(
            nn.Linear(_in, _out),
        )

        self.attention = Attention(_out)

        # NOTE(review): defined but never used in forward().
        self.bn = nn.BatchNorm2d(_out)

        self.mlp4 = nn.Sequential(
            nn.Linear(_out, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, _out),
        )

    def forward(self, x, residual=False):
        b, c, h, w = x.shape
        # Fold the 4 channels into a 2x2 spatial grid -> (b, 2h, 2w).
        x = einops.rearrange(x, "b (a c)h w ->b (a h)(c w)", a=2,c=2) # x -> [1, 256, 256]
        # NOTE(review): torch.linalg.svd returns (U, S, Vh); `V` is V^H here.
        U, s, V = torch.linalg.svd(x) # U->[b 256 256], s-> [b 256], V->[b 256 256]
        U_T = U.permute(0, 2, 1)
        out = self.mlp1(U_T) + self.mlp2(V) + self.mlp3(s).unsqueeze(1) # s -> [b, 1, 256] => [b, 256, 256]
        out = self.attention(out).mean(1)
        # Predicted residual on the singular values.
        out = self.mlp4(out) + s
        pred = U @ torch.diag_embed(out) @ V
        return einops.rearrange(pred, "b (a h)(c w) -> b (a c) h w", a=2,c=2)
|
| 176 |
+
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
class SVDNoiseUnet128(nn.Module):
    """Square-latent (128x128) variant of SVDNoiseUnet.

    NOTE(review): structurally identical to SVDNoiseUnet64 except the default
    resolution; the two could share one implementation.
    """

    def __init__(self, in_channels=4, out_channels=4, resolution=128): # resolution = size // 8
        super(SVDNoiseUnet128, self).__init__()

        # The folded matrix is (2*resolution) x (2*resolution).
        _in = int(resolution * in_channels // 2)
        _out = int(resolution * out_channels // 2)
        self.mlp1 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )
        self.mlp2 = nn.Sequential(
            nn.Linear(_in, 64),
            nn.ReLU(inplace=True),
            nn.Linear(64, _out),
        )

        self.mlp3 = nn.Sequential(
            nn.Linear(_in, _out),
        )

        self.attention = Attention(_out)

        # NOTE(review): defined but never used in forward().
        self.bn = nn.BatchNorm2d(_out)

        self.mlp4 = nn.Sequential(
            nn.Linear(_out, 1024),
            nn.ReLU(inplace=True),
            nn.Linear(1024, _out),
        )

    def forward(self, x, residual=False):
        b, c, h, w = x.shape
        # Fold the 4 channels into a 2x2 spatial grid -> (b, 2h, 2w).
        x = einops.rearrange(x, "b (a c)h w ->b (a h)(c w)", a=2,c=2) # x -> [1, 256, 256]
        # NOTE(review): torch.linalg.svd returns (U, S, Vh); `V` is V^H here.
        U, s, V = torch.linalg.svd(x) # U->[b 256 256], s-> [b 256], V->[b 256 256]
        U_T = U.permute(0, 2, 1)
        out = self.mlp1(U_T) + self.mlp2(V) + self.mlp3(s).unsqueeze(1) # s -> [b, 1, 256] => [b, 256, 256]
        out = self.attention(out).mean(1)
        # Predicted residual on the singular values.
        out = self.mlp4(out) + s
        pred = U @ torch.diag_embed(out) @ V
        return einops.rearrange(pred, "b (a h)(c w) -> b (a c) h w", a=2,c=2)
|
| 220 |
+
|
| 221 |
+
|
| 222 |
+
|
| 223 |
+
class SVDNoiseUnet_Concise(nn.Module):
    # NOTE(review): stub — exported in __all__ but defines no layers and no
    # forward(); the constructor only calls super(). Presumably a placeholder
    # for a planned concise variant.
    def __init__(self, in_channels=4, out_channels=4, resolution=64):
        super(SVDNoiseUnet_Concise, self).__init__()
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
from diffusers.models.normalization import AdaGroupNorm
|
| 229 |
+
|
| 230 |
+
class NPNet(nn.Module):
    """Noise Prompt Network: turns an initial latent noise into "golden noise".

    Combines an SVD-domain refiner (`unet_svd`), a Swin-based refiner
    (`unet_embedding`), and a prompt-conditioned normalization
    (`text_embedding`) with two learned scalars `_alpha`/`_beta`.
    """

    def __init__(self, model_id, pretrained_path=' ', device='cuda') -> None:
        super(NPNet, self).__init__()

        assert model_id in ['SD1.5', 'DreamShaper', 'DiT']

        self.model_id = model_id
        self.device = device
        self.pretrained_path = pretrained_path

        (
            self.unet_svd,
            self.unet_embedding,
            self.text_embedding,
            self._alpha,
            self._beta,
        ) = self.get_model()

    def save_model(self, save_path: str):
        """
        Save this NPNet so that get_model() can later reload it.
        """
        torch.save({
            "unet_svd": self.unet_svd.state_dict(),
            "unet_embedding": self.unet_embedding.state_dict(),
            "embeeding": self.text_embedding.state_dict(),  # (sic) key must match get_model()
            "alpha": self._alpha,
            "beta": self._beta,
        }, save_path)
        print(f"NPNet saved to {save_path}")

    def get_model(self):
        """Build the submodules; load weights when `pretrained_path` is a .pth file.

        Returns:
            (unet_svd, unet_embedding, text_embedding, _alpha, _beta)
        """
        unet_embedding = NoiseTransformer(resolution=(128, 96)).to(self.device).to(torch.float32)
        unet_svd = SVDNoiseUnet(resolution=(128, 96)).to(self.device).to(torch.float32)

        # The original had identical DiT/non-DiT branches; collapsed to one.
        text_embedding = AdaGroupNorm(768 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)

        # Random initialization, overwritten below when a checkpoint is loaded.
        _alpha = torch.randn(1, device=self.device)
        _beta = torch.randn(1, device=self.device)

        if '.pth' in self.pretrained_path:
            # map_location: allow loading checkpoints saved on another device
            # (e.g. a cuda-saved checkpoint on a cpu-only host).
            golden_unet = torch.load(self.pretrained_path, map_location=self.device)
            unet_svd.load_state_dict(golden_unet["unet_svd"], strict=True)
            unet_embedding.load_state_dict(golden_unet["unet_embedding"], strict=True)
            text_embedding.load_state_dict(golden_unet["embeeding"], strict=True)
            _alpha = golden_unet["alpha"]
            _beta = golden_unet["beta"]

            print("Load Successfully!")

        return unet_svd, unet_embedding, text_embedding, _alpha, _beta

    def forward(self, initial_noise, prompt_embeds):
        """Refine `initial_noise` into golden noise conditioned on the prompt."""
        prompt_embeds = prompt_embeds.float().view(prompt_embeds.shape[0], -1)
        text_emb = self.text_embedding(initial_noise.float(), prompt_embeds)

        encoder_hidden_states_svd = initial_noise
        encoder_hidden_states_embedding = initial_noise + text_emb

        golden_embedding = self.unet_embedding(encoder_hidden_states_embedding.float())

        # Learned blend: 2*sigmoid(_alpha)-1 maps the text term's weight
        # into (-1, 1); _beta scales the Swin-refined term.
        golden_noise = self.unet_svd(encoder_hidden_states_svd.float()) + (
            2 * torch.sigmoid(self._alpha) - 1) * text_emb + self._beta * golden_embedding

        return golden_noise
|
| 303 |
+
|
| 304 |
+
|
| 305 |
+
class NPNet64(nn.Module):
    """NPNet variant for 64x64 latents (512px images, SD1.5-scale models)."""

    def __init__(self, model_id, pretrained_path=' ', device='cuda') -> None:
        super(NPNet64, self).__init__()
        self.model_id = model_id
        self.device = device
        self.pretrained_path = pretrained_path

        (
            self.unet_svd,
            self.unet_embedding,
            self.text_embedding,
            self._alpha,
            self._beta,
        ) = self.get_model()

    def save_model(self, save_path: str):
        """
        Save this NPNet so that get_model() can later reload it.
        """
        torch.save({
            "unet_svd": self.unet_svd.state_dict(),
            "unet_embedding": self.unet_embedding.state_dict(),
            "embeeding": self.text_embedding.state_dict(),  # (sic) key must match get_model()
            "alpha": self._alpha,
            "beta": self._beta,
        }, save_path)
        print(f"NPNet saved to {save_path}")

    def get_model(self):
        """Build the submodules; load weights when `pretrained_path` is a .pth file.

        Bug fix vs. original: without a checkpoint the original fell off the
        end and returned None, crashing the tuple-unpack in __init__. Now the
        freshly initialized modules are always returned.
        """
        unet_embedding = NoiseTransformer(resolution=(64, 64)).to(self.device).to(torch.float32)
        unet_svd = SVDNoiseUnet64(resolution=64).to(self.device).to(torch.float32)
        # Random initialization, overwritten below when a checkpoint is loaded.
        _alpha = torch.randn(1, device=self.device)
        _beta = torch.randn(1, device=self.device)

        text_embedding = AdaGroupNorm(768 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)

        if '.pth' in self.pretrained_path:
            # map_location: allow loading checkpoints saved on another device.
            golden_unet = torch.load(self.pretrained_path, map_location=self.device)
            unet_svd.load_state_dict(golden_unet["unet_svd"])
            unet_embedding.load_state_dict(golden_unet["unet_embedding"])
            text_embedding.load_state_dict(golden_unet["embeeding"])
            _alpha = golden_unet["alpha"]
            _beta = golden_unet["beta"]

            print("Load Successfully!")

        return unet_svd, unet_embedding, text_embedding, _alpha, _beta

    def forward(self, initial_noise, prompt_embeds):
        """Refine `initial_noise` into golden noise conditioned on the prompt."""
        prompt_embeds = prompt_embeds.float().view(prompt_embeds.shape[0], -1)
        text_emb = self.text_embedding(initial_noise.float(), prompt_embeds)

        encoder_hidden_states_svd = initial_noise
        encoder_hidden_states_embedding = initial_noise + text_emb

        golden_embedding = self.unet_embedding(encoder_hidden_states_embedding.float())

        # Learned blend: 2*sigmoid(_alpha)-1 in (-1,1) weights the text term.
        golden_noise = self.unet_svd(encoder_hidden_states_svd.float()) + (
            2 * torch.sigmoid(self._alpha) - 1) * text_emb + self._beta * golden_embedding

        return golden_noise
|
| 370 |
+
|
| 371 |
+
class NPNet128(nn.Module):
    """NPNet variant for 128x128 latents (1024px images, SDXL-scale models).

    Unlike NPNet/NPNet64 this variant requires pretrained weights: get_model()
    raises when no usable checkpoint path is given.
    """

    def __init__(self, model_id, pretrained_path=' ', device='cuda') -> None:
        # Bug fix vs. original: the default was `pretrained_path=True`, which
        # made `'.pth' in self.pretrained_path` raise TypeError. A string
        # default matches the sibling classes; a missing checkpoint now fails
        # with an explicit FileNotFoundError in get_model().
        super(NPNet128, self).__init__()

        assert model_id in ['SDXL', 'DreamShaper', 'DiT']

        self.model_id = model_id
        self.device = device
        self.pretrained_path = pretrained_path

        (
            self.unet_svd,
            self.unet_embedding,
            self.text_embedding,
            self._alpha,
            self._beta,
        ) = self.get_model()

    def get_model(self):
        """Build the submodules and load the required checkpoint.

        Raises:
            FileNotFoundError: when `pretrained_path` is not a .pth file.
              (The original used `assert "..."` — a no-op on a truthy string —
              and then implicitly returned None, crashing the caller.)
        """
        unet_embedding = NoiseTransformer(resolution=(128, 128)).to(self.device).to(torch.float32)
        unet_svd = SVDNoiseUnet128(resolution=128).to(self.device).to(torch.float32)

        # DiT uses 1024-dim text embeddings; SDXL-family models use 2048-dim.
        if self.model_id == 'DiT':
            text_embedding = AdaGroupNorm(1024 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)
        else:
            text_embedding = AdaGroupNorm(2048 * 77, 4, 1, eps=1e-6).to(self.device).to(torch.float32)

        if '.pth' in self.pretrained_path:
            # map_location: allow loading checkpoints saved on another device.
            golden_unet = torch.load(self.pretrained_path, map_location=self.device)
            unet_svd.load_state_dict(golden_unet["unet_svd"])
            unet_embedding.load_state_dict(golden_unet["unet_embedding"])
            text_embedding.load_state_dict(golden_unet["embeeding"])
            _alpha = golden_unet["alpha"]
            _beta = golden_unet["beta"]

            print("Load Successfully!")

            return unet_svd, unet_embedding, text_embedding, _alpha, _beta

        raise FileNotFoundError("No Pretrained Weights Found!")

    def forward(self, initial_noise, prompt_embeds):
        """Refine `initial_noise` into golden noise conditioned on the prompt."""
        prompt_embeds = prompt_embeds.float().view(prompt_embeds.shape[0], -1)
        text_emb = self.text_embedding(initial_noise.float(), prompt_embeds)

        encoder_hidden_states_svd = initial_noise
        encoder_hidden_states_embedding = initial_noise + text_emb

        golden_embedding = self.unet_embedding(encoder_hidden_states_embedding.float())

        # Learned blend: 2*sigmoid(_alpha)-1 in (-1,1) weights the text term.
        golden_noise = self.unet_svd(encoder_hidden_states_svd.float()) + (
            2 * torch.sigmoid(self._alpha) - 1) * text_emb + self._beta * golden_embedding

        return golden_noise
|
| 430 |
+
|
app.py
CHANGED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import numpy as np
|
| 3 |
+
import random
|
| 4 |
+
import json
|
| 5 |
+
import spaces #[uncomment to use ZeroGPU]
|
| 6 |
+
from diffusers import (
|
| 7 |
+
AutoencoderKL,
|
| 8 |
+
StableDiffusionXLPipeline,
|
| 9 |
+
)
|
| 10 |
+
from huggingface_hub import login, hf_hub_download
|
| 11 |
+
from PIL import Image
|
| 12 |
+
# from huggingface_hub import login
|
| 13 |
+
from SVDNoiseUnet import NPNet64
|
| 14 |
+
import functools
|
| 15 |
+
import random
|
| 16 |
+
from free_lunch_utils import register_free_upblock2d, register_free_crossattn_upblock2d
|
| 17 |
+
import torch
|
| 18 |
+
import torch.nn as nn
|
| 19 |
+
from einops import rearrange
|
| 20 |
+
from torchvision.utils import make_grid
|
| 21 |
+
import time
|
| 22 |
+
from pytorch_lightning import seed_everything
|
| 23 |
+
from torch import autocast
|
| 24 |
+
from contextlib import contextmanager, nullcontext
|
| 25 |
+
import accelerate
|
| 26 |
+
import torchsde
|
| 27 |
+
from SVDNoiseUnet import NPNet128
|
| 28 |
+
from tqdm import tqdm, trange
|
| 29 |
+
from itertools import islice
|
| 30 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 31 |
+
model_repo_id = "Lykon/dreamshaper-xl-1-0" # Replace to the model you would like to use
|
| 32 |
+
from sampler import UniPCSampler
|
| 33 |
+
|
| 34 |
+
precision_scope = autocast
|
| 35 |
+
|
| 36 |
+
def chunk(it, size):
    """Split an iterable into consecutive tuples of length `size`.

    The final tuple may be shorter; an empty iterable yields nothing.
    """
    source = iter(it)

    def _take():
        return tuple(islice(source, size))

    # iter(callable, sentinel): stop when a chunk comes back empty.
    return iter(_take, ())
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def numpy_to_pil(images):
    """
    Convert a numpy image or a batch of images to a PIL image.

    A single HWC array is promoted to a batch of one; float values in [0, 1]
    are scaled to uint8 before conversion.
    """
    batch = images[None, ...] if images.ndim == 3 else images
    as_uint8 = (batch * 255).round().astype("uint8")
    return [Image.fromarray(frame) for frame in as_uint8]
|
| 51 |
+
|
| 52 |
+
|
| 53 |
+
def load_replacement(x):
    # Replace a (presumably safety-flagged) image array with the placeholder
    # "assets/rick.jpeg", resized to match; on ANY failure (missing asset,
    # shape mismatch) silently return the original image unchanged.
    try:
        hwc = x.shape
        # PIL resize takes (width, height) = (shape[1], shape[0]).
        y = Image.open("assets/rick.jpeg").convert("RGB").resize((hwc[1], hwc[0]))
        y = (np.array(y) / 255.0).astype(x.dtype)
        assert y.shape == x.shape
        return y
    except Exception:
        return x
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
# Adapted from pipelines.StableDiffusionPipeline.encode_prompt
|
| 65 |
+
def encode_prompt(prompt_batch, text_encoder, tokenizer, proportion_empty_prompts, is_train=True):
    """Tokenize and encode a batch of captions with a text encoder.

    With probability `proportion_empty_prompts` a caption is replaced by the
    empty string (classifier-free-guidance-style dropout). List/array captions
    contribute one element: random during training, the first one otherwise.
    Returns the encoder's hidden states (first output) with no gradients.
    """
    captions = []
    for caption in prompt_batch:
        if random.random() < proportion_empty_prompts:
            captions.append("")
        elif isinstance(caption, str):
            captions.append(caption)
        elif isinstance(caption, (list, np.ndarray)):
            # take a random caption if there are multiple
            captions.append(random.choice(caption) if is_train else caption[0])

    with torch.no_grad():
        text_inputs = tokenizer(
            captions,
            padding="max_length",
            max_length=tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        text_input_ids = text_inputs.input_ids
        prompt_embeds = text_encoder(text_input_ids.to(text_encoder.device))[0]

    return prompt_embeds
|
| 88 |
+
|
| 89 |
+
def chunk(it, size):
    """Split an iterable into consecutive tuples of length `size`.

    NOTE(review): exact duplicate of the `chunk` defined earlier in this
    file; this later definition is the one bound at import time.
    """
    iterator = iter(it)
    next_piece = lambda: tuple(islice(iterator, size))
    return iter(next_piece, ())
|
| 92 |
+
|
| 93 |
+
def convert_caption_json_to_str(json):
    """Return the "caption" field of a caption record (dict).

    NOTE(review): the parameter name shadows the module-level `json` import
    inside this function; kept unchanged to preserve keyword-call
    compatibility for existing callers.
    """
    return json["caption"]
|
| 96 |
+
|
| 97 |
+
def prepare_sdxl_pipeline_step_parameter(pipe, prompts, need_cfg, device, negative_prompts, W = 1024, H = 1024):
    """Build SDXL UNet conditioning for a manual sampling loop.

    Encodes prompts/negatives via the pipeline and assembles the SDXL
    `added_cond_kwargs` (pooled text embeds + micro-conditioning time ids).
    When `need_cfg` is True the negative and positive halves are concatenated
    along the batch dimension in [negative, positive] order.

    Returns:
        (prompt_embeds, {"text_embeds": ..., "time_ids": ...})
    """
    (
        prompt_embeds,
        negative_prompt_embeds,
        pooled_prompt_embeds,
        negative_pooled_prompt_embeds,
    ) = pipe.encode_prompt(
        prompt=prompts,
        negative_prompt=negative_prompts,
        device=device,
        do_classifier_free_guidance=need_cfg,
    )
    # timesteps = pipe.scheduler.timesteps

    prompt_embeds = prompt_embeds.to(device)
    add_text_embeds = pooled_prompt_embeds.to(device)
    # SDXL micro-conditioning: original size, crop offsets, target size.
    original_size = (W, H)
    crops_coords_top_left = (0, 0)
    target_size = (W, H)
    text_encoder_projection_dim = None
    add_time_ids = list(original_size + crops_coords_top_left + target_size)
    if pipe.text_encoder_2 is None:
        text_encoder_projection_dim = int(pooled_prompt_embeds.shape[-1])
    else:
        text_encoder_projection_dim = pipe.text_encoder_2.config.projection_dim
    # Sanity check: the UNet's add_embedding must accept the assembled vector.
    passed_add_embed_dim = (
        pipe.unet.config.addition_time_embed_dim * len(add_time_ids) + text_encoder_projection_dim
    )
    expected_add_embed_dim = pipe.unet.add_embedding.linear_1.in_features
    if expected_add_embed_dim != passed_add_embed_dim:
        raise ValueError(
            f"Model expects an added time embedding vector of length {expected_add_embed_dim}, but a vector of {passed_add_embed_dim} was created. The model has an incorrect config. Please check `unet.config.time_embedding_type` and `text_encoder_2.config.projection_dim`."
        )
    add_time_ids = torch.tensor([add_time_ids], dtype=prompt_embeds.dtype)
    add_time_ids = add_time_ids.to(device)
    # Negative prompt shares the same micro-conditioning.
    negative_add_time_ids = add_time_ids

    if need_cfg:
        # CFG layout: [negative, positive] along the batch dimension.
        prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds], dim=0)
        add_text_embeds = torch.cat([negative_pooled_prompt_embeds, add_text_embeds], dim=0)
        add_time_ids = torch.cat([negative_add_time_ids, add_time_ids], dim=0)
    ret_dict = {
        "text_embeds": add_text_embeds,
        "time_ids": add_time_ids
    }
    return prompt_embeds, ret_dict
|
| 143 |
+
|
| 144 |
+
|
| 145 |
+
def model_closure(pipe):
    """Bind `pipe` into a sampler-compatible denoiser callable.

    The returned function has signature (x, t, c), where `c` is a sequence:
    c[0] is the prompt-embedding tensor, and c[1] (optional) is the SDXL
    added-condition kwargs dict. The prompt is cast to x's device/dtype and
    the UNet's `.sample` tensor is returned.
    """
    def model_fn(x, t, c):
        prompt = c[0]
        cond_kwargs = c[1] if len(c) > 1 else None
        hidden_states = prompt.to(device=x.device, dtype=x.dtype)
        result = pipe.unet(
            x,
            t,
            encoder_hidden_states=hidden_states,
            added_cond_kwargs=cond_kwargs,
        )
        return result.sample

    return model_fn
|
| 157 |
+
|
| 158 |
+
|
| 159 |
+
# ---- Global model setup (runs once at import time; requires CUDA) ----
torch_dtype = torch.float16
repo_id = "madebyollin/sdxl-vae-fp16-fix"  # e.g., "distilbert/distilgpt2"
# fp16-safe SDXL VAE (avoids NaNs when decoding in float16).
vae = AutoencoderKL.from_pretrained("madebyollin/sdxl-vae-fp16-fix",torch_dtype=torch_dtype) #from_single_file(downloaded_path, torch_dtype=torch_dtype)
vae.to('cuda')

pipe = StableDiffusionXLPipeline.from_pretrained("John6666/illustrij-evo-lvl3-sdxl",torch_dtype=torch_dtype,vae=vae)
# pipe = StableDiffusionXLPipeline.from_pretrained("stabilityai/stable-diffusion-xl-base-1.0",torch_dtype=torch.float16,vae=vae)

pipe.to('cuda')

MAX_SEED = np.iinfo(np.int32).max
MAX_IMAGE_SIZE = 1024

accelerator = accelerate.Accelerator()
|
| 175 |
+
|
| 176 |
+
def generate_image_with_steps(prompt, negative_prompt, seed, width, height, guidance_scale, num_inference_steps):
    """Sample one image with the UniPC sampler at `num_inference_steps` steps.

    Args:
        prompt / negative_prompt: text conditioning (negative used only when CFG is on).
        seed: RNG seed controlling the initial latent noise.
        width, height: output resolution in pixels (must be multiples of 8).
        guidance_scale: CFG scale; 1.0 disables classifier-free guidance.
        num_inference_steps: number of sampler steps (UI offers 6 or 8).

    Returns:
        A PIL.Image with the decoded sample.
    """
    # Fix the global RNG so `seed` actually controls the initial noise.
    # Previously `seed` was accepted but never used, so "randomize seed" had
    # no effect and the quick vs. 50-step images did not share latents.
    torch.manual_seed(seed)

    prompts = [prompt]
    sampler = UniPCSampler(pipe, model_closure=model_closure, steps=num_inference_steps, guidance_scale=guidance_scale)
    c = prompts
    # CFG needs an unconditional branch; skip it entirely when guidance is off.
    uc = [negative_prompt] * len(c) if guidance_scale != 1.0 else None
    # Latent shape: (channels, width/8, height/8).  NOTE(review): width comes
    # before height here, mirroring the original code — confirm the sampler
    # expects this order.
    shape = [4, width // 8, height // 8]
    samples, _ = sampler.sample(
        conditioning=c,
        batch_size=1,
        shape=shape,
        unconditional_conditioning=uc,
        x_T=None,  # let the sampler draw the initial noise (now seeded above)
        # FreeU kicks in later when more steps are available.
        start_free_u_step=6 if num_inference_steps == 8 else 4,
        xl_preprocess_closure=prepare_sdxl_pipeline_step_parameter,
        # npnet = npn_net,
        use_corrector=True,
    )

    # Decode latents to pixel space and map from [-1, 1] to [0, 1].
    x_samples = pipe.vae.decode(samples / pipe.vae.config.scaling_factor).sample
    x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
    x_samples = x_samples.cpu().permute(0, 2, 3, 1).numpy()

    x_image_torch = torch.from_numpy(x_samples).permute(0, 3, 1, 2)

    # batch_size is 1, so return the first (only) decoded image.
    for x_sample in x_image_torch:
        x_sample = 255.0 * rearrange(x_sample.cpu().numpy(), "c h w -> h w c")
        img = Image.fromarray(x_sample.astype(np.uint8))
        return img
|
| 207 |
+
|
| 208 |
+
@spaces.GPU
def infer(
    prompt,
    negative_prompt,
    seed,
    randomize_seed,
    resolution,
    guidance_scale,
    num_inference_steps,
    progress=gr.Progress(track_tqdm=True),
):
    """Gradio callback: produce the fast few-step image and a 50-step
    reference image with the same settings, plus the seed actually used."""
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # "1024x1024" -> (1024, 1024)
    width, height = (int(part) for part in resolution.split('x'))

    # Fast few-step result.
    image_quick = generate_image_with_steps(
        prompt, negative_prompt, seed, width, height, guidance_scale, num_inference_steps
    )

    # High-quality 50-step baseline for side-by-side comparison.
    image_50_steps = generate_image_with_steps(
        prompt, negative_prompt, seed, width, height, guidance_scale, 50
    )

    return image_quick, image_50_steps, seed
|
| 232 |
+
|
| 233 |
+
|
| 234 |
+
# Example prompts shown under the input box; clicking one fills the prompt field.
examples = [
    "Astronaut in a jungle, cold color, muted colors, detailed, 8k",
    "a painting of a virus monster playing guitar",
    "a painting of a squirrel eating a burger",
]

# Page-level CSS: center the main column and cap its width.
css = """
#col-container {
    margin: 0 auto;
    max-width: 640px;
}
"""
|
| 246 |
+
|
| 247 |
+
# ---------------------------------------------------------------------------
# Gradio UI: prompt box + run button, side-by-side comparison of the fast
# few-step result against a 50-step reference, and advanced settings.
# ---------------------------------------------------------------------------
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown(" # Hyperparameters are all you need")

        with gr.Row():
            prompt = gr.Text(
                label="Prompt",
                show_label=False,
                max_lines=1,
                placeholder="Enter your prompt",
                container=False,
            )

            run_button = gr.Button("Run", scale=0, variant="primary")

        # Two result panes: the few-step sampler output and the 50-step baseline.
        with gr.Row():
            with gr.Column():
                gr.Markdown("### Our fast inference Result")
                result = gr.Image(label="Quick Result", show_label=False)
            with gr.Column():
                gr.Markdown("### Original 50 steps Result")
                result_50_steps = gr.Image(label="50 Steps Result", show_label=False)

        with gr.Accordion("Advanced Settings", open=False):
            # Currently hidden; `infer` still receives its (empty) value.
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=1,
                placeholder="Enter a negative prompt",
                visible=False,
            )

            seed = gr.Slider(
                label="Seed",
                minimum=0,
                maximum=MAX_SEED,
                step=1,
                value=0,
            )

            randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            # SDXL-native resolutions: square plus ~3:2 landscape/portrait.
            resolution = gr.Dropdown(
                choices=[
                    "1024x1024",
                    "1216x832",
                    "832x1216"
                ],
                value="1024x1024",
                label="Resolution",
            )

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="Guidance scale",
                    minimum=0.0,
                    maximum=10.0,
                    step=0.1,
                    value=7.5,
                )

                # Only 6 or 8 steps: `generate_image_with_steps` tunes its
                # FreeU start step for exactly these two counts.
                num_inference_steps = gr.Dropdown(
                    choices=[6, 8],
                    value=8,
                    label="Number of inference steps",
                )

        gr.Examples(examples=examples, inputs=[prompt])
    # Wire both the Run button and pressing Enter in the prompt box to `infer`.
    gr.on(
        triggers=[run_button.click, prompt.submit],
        fn=infer,
        inputs=[
            prompt,
            negative_prompt,
            seed,
            randomize_seed,
            resolution,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result, result_50_steps, seed],
    )

if __name__ == "__main__":
    demo.launch()
|
dpm_solver_v3.py
ADDED
|
@@ -0,0 +1,904 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn.functional as F
|
| 3 |
+
import math
|
| 4 |
+
import numpy as np
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class NoiseScheduleVP:
    """Wrapper of the forward SDE (VP type) used by DPM-Solver.

    The forward SDE gives q_{t|0}(x_t | x_0) = N(alpha_t * x_0, sigma_t^2 * I),
    and lambda_t = log(alpha_t) - log(sigma_t) is the half-logSNR.  The class
    exposes alpha_t, sigma_t, lambda_t and the inverse of lambda for t in [0, T].

    Supported schedules:
      * "discrete" -- discrete-time DPMs; pass either `betas` or
        `alphas_cumprod` (the \\hat{alpha}_n array of DDPM, so
        alpha_{t_n} = sqrt(\\hat{alpha}_n)).  Discrete steps n = 0..N-1 are
        mapped to continuous times t_n = (n + 1) / N.
      * "linear" / "cosine" -- continuous-time VPSDEs with the standard DDPM /
        improved-DDPM hyperparameters (`continuous_beta_0`, `continuous_beta_1`).

    Example:
        >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod)
        >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.)
    """

    def __init__(
        self,
        schedule="discrete",
        betas=None,
        alphas_cumprod=None,
        continuous_beta_0=0.1,
        continuous_beta_1=20.0,
    ):
        if schedule not in ["discrete", "linear", "cosine"]:
            raise ValueError(
                "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format(
                    schedule
                )
            )
        # Bug fix: `alphas_cumprod` is optional (schedule="linear"/"cosine",
        # or "discrete" constructed from `betas`); previously
        # `(1 - alphas_cumprod)` raised a TypeError when it was None.
        # `sigmas`/`log_sigmas` exist only when `alphas_cumprod` is given;
        # they back `sigma_to_t` and `get_special_sigmas_with_timesteps`.
        self.alphas_cumprod = alphas_cumprod
        if alphas_cumprod is not None:
            self.sigmas = ((1 - alphas_cumprod) / alphas_cumprod) ** 0.5
            self.log_sigmas = self.sigmas.log()
        self.schedule = schedule
        if schedule == "discrete":
            if betas is not None:
                # log(alpha_t) accumulated from the beta array.
                log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0)
            else:
                assert alphas_cumprod is not None
                log_alphas = 0.5 * torch.log(alphas_cumprod)
            self.total_N = len(log_alphas)
            self.T = 1.0
            # Piecewise-linear interpolation grid for log(alpha_t).
            self.t_array = torch.linspace(0.0, 1.0, self.total_N + 1)[1:].reshape((1, -1))
            self.log_alpha_array = log_alphas.reshape((1, -1))
        else:
            self.total_N = 1000
            self.beta_0 = continuous_beta_0
            self.beta_1 = continuous_beta_1
            self.cosine_s = 0.008
            self.cosine_beta_max = 999.0
            self.cosine_t_max = (
                math.atan(self.cosine_beta_max * (1.0 + self.cosine_s) / math.pi)
                * 2.0
                * (1.0 + self.cosine_s)
                / math.pi
                - self.cosine_s
            )
            self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1.0 + self.cosine_s) * math.pi / 2.0))
            self.schedule = schedule
            if schedule == "cosine":
                # T = 1 has numerical issues for the cosine schedule; 0.9946 is
                # the empirically safe ending time used by DPM-Solver.
                self.T = 0.9946
            else:
                self.T = 1.0

    def marginal_log_mean_coeff(self, t):
        """Compute log(alpha_t) of a given continuous-time label t in [0, T]."""
        if self.schedule == "discrete":
            # NOTE: relies on the module-level `interpolate_fn` helper.
            return interpolate_fn(
                t.reshape((-1, 1)), self.t_array.to(t.device), self.log_alpha_array.to(t.device)
            ).reshape((-1))
        elif self.schedule == "linear":
            return -0.25 * t**2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0
        elif self.schedule == "cosine":
            log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1.0 + self.cosine_s) * math.pi / 2.0))
            log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0
            return log_alpha_t

    def sigma_to_t(self, sigma, quantize=None):
        """Map a sigma value (sigma = sqrt((1-a)/a)) to a fractional discrete
        timestep by log-linear interpolation over `log_sigmas`.

        NOTE: `quantize` is deliberately forced to None (always interpolate
        rather than snap to the nearest index), matching the original behavior.
        Requires the instance to have been built with `alphas_cumprod`.
        """
        quantize = None
        log_sigma = sigma.log()
        dists = log_sigma - self.log_sigmas[:, None]
        if quantize:
            return dists.abs().argmin(dim=0).view(sigma.shape)
        low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1
        low, high = self.log_sigmas[low_idx], self.log_sigmas[high_idx]
        w = (low - log_sigma) / (low - high)
        w = w.clamp(0, 1)
        t = (1 - w) * low_idx + w * high_idx
        return t.view(sigma.shape)

    def get_special_sigmas_with_timesteps(self, timesteps):
        """Return sigmas for (possibly fractional) discrete `timesteps` by
        linearly interpolating `alphas_cumprod` between neighboring integer
        steps (indices clamped to 999, i.e. a 1000-step table is assumed).

        `timesteps` is a 1-D numpy array of floats.
        """
        # Bug fix: torch tensors must be indexed with integer arrays; the
        # floor/ceil results were float64 before, which raises an IndexError.
        low_idx = np.minimum(np.floor(timesteps), 999).astype(np.int64)
        high_idx = np.minimum(np.ceil(timesteps), 999).astype(np.int64)
        w = torch.from_numpy(timesteps - np.floor(timesteps))
        self.alphas_cumprod = self.alphas_cumprod.to('cpu')
        alphas = (1 - w) * self.alphas_cumprod[low_idx] + w * self.alphas_cumprod[high_idx]
        return ((1 - alphas) / alphas) ** 0.5

    def marginal_alpha(self, t):
        """Compute alpha_t of a given continuous-time label t in [0, T]."""
        return torch.exp(self.marginal_log_mean_coeff(t))

    def marginal_std(self, t):
        """Compute sigma_t of a given continuous-time label t in [0, T]."""
        return torch.sqrt(1.0 - torch.exp(2.0 * self.marginal_log_mean_coeff(t)))

    def marginal_lambda(self, t):
        """Compute lambda_t = log(alpha_t) - log(sigma_t) for t in [0, T]."""
        log_mean_coeff = self.marginal_log_mean_coeff(t)
        log_std = 0.5 * torch.log(1.0 - torch.exp(2.0 * log_mean_coeff))
        return log_mean_coeff - log_std

    def inverse_lambda(self, lamb):
        """Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t."""
        if self.schedule == "linear":
            tmp = 2.0 * (self.beta_1 - self.beta_0) * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
            Delta = self.beta_0**2 + tmp
            return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0)
        elif self.schedule == "discrete":
            log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2.0 * lamb)
            t = interpolate_fn(
                log_alpha.reshape((-1, 1)),
                torch.flip(self.log_alpha_array.to(lamb.device), [1]),
                torch.flip(self.t_array.to(lamb.device), [1]),
            )
            return t.reshape((-1,))
        else:
            log_alpha = -0.5 * torch.logaddexp(-2.0 * lamb, torch.zeros((1,)).to(lamb))
            t_fn = (
                lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0))
                * 2.0
                * (1.0 + self.cosine_s)
                / math.pi
                - self.cosine_s
            )
            t = t_fn(log_alpha)
            return t
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def model_wrapper(
    model,
    noise_schedule,
    model_type="noise",
    model_kwargs={},
    guidance_type="uncond",
    condition=None,
    unconditional_condition=None,
    guidance_scale=1.0,
    classifier_fn=None,
    classifier_kwargs={},
):
    """Create a continuous-time noise-prediction function for DPM-Solver.

    The returned `model_fn(x, t_continuous)` always outputs predicted noise,
    regardless of the underlying model's parameterization.

    Args:
        model: the diffusion model; call convention
            model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score.
        noise_schedule: a NoiseScheduleVP-like object.
        model_type: "noise" | "x_start" | "v" | "score" -- the model's
            prediction parameterization ("v" as in progressive distillation;
            for "score", noise(x_t, t) = -sigma_t * score(x_t, t)).
        model_kwargs: extra keyword args forwarded to `model`.
        guidance_type: "uncond" | "classifier" | "classifier-free".
        condition: conditioning input for guided sampling.
        unconditional_condition: conditioning for the unconditional branch
            ("classifier-free" only).
        guidance_scale: guidance strength; 1.0 disables CFG.
        classifier_fn: classifier for "classifier" guidance;
            classifier_fn(x, t_input, cond, **classifier_kwargs) -> log-prob.
        classifier_kwargs: extra keyword args for `classifier_fn`.

    Returns:
        model_fn(x, t_continuous) -> predicted noise.
    """
    # Validate eagerly, before any closure is built or called.  Fix: "score"
    # was previously rejected by this assert even though `noise_pred_fn`
    # implements it and the docstring advertises it.
    assert model_type in ["noise", "x_start", "v", "score"]
    assert guidance_type in ["uncond", "classifier", "classifier-free"]

    def get_model_input_time(t_continuous):
        """Map continuous time in [1/N, 1] to the discrete model's input time
        in [0, 1000*(N-1)/N]; continuous-time models take t as-is."""
        if noise_schedule.schedule == "discrete":
            return (t_continuous - 1.0 / noise_schedule.total_N) * 1000.0
        else:
            return t_continuous

    def noise_pred_fn(x, t_continuous, cond=None):
        """Call `model` and convert its output to a noise prediction."""
        if t_continuous.reshape((-1,)).shape[0] == 1:
            t_continuous = t_continuous.expand((x.shape[0]))
        t_input = get_model_input_time(t_continuous)
        if cond is None:
            output = model(x, t_input, None, **model_kwargs)
        else:
            output = model(x, t_input, cond, **model_kwargs)
        if model_type == "noise":
            return output
        elif model_type == "x_start":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims)
        elif model_type == "v":
            alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x
        elif model_type == "score":
            sigma_t = noise_schedule.marginal_std(t_continuous)
            dims = x.dim()
            return -expand_dims(sigma_t, dims) * output

    def cond_grad_fn(x, t_input):
        """Gradient of the classifier log-probability, nabla_x log p_t(cond | x_t)."""
        with torch.enable_grad():
            x_in = x.detach().requires_grad_(True)
            log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs)
            return torch.autograd.grad(log_prob.sum(), x_in)[0]

    def model_fn(x, t_continuous):
        """The noise prediction function used by DPM-Solver."""
        if t_continuous.reshape((-1,)).shape[0] == 1:
            t_continuous = t_continuous.expand((x.shape[0]))
        if guidance_type == "uncond":
            return noise_pred_fn(x, t_continuous)
        elif guidance_type == "classifier":
            assert classifier_fn is not None
            t_input = get_model_input_time(t_continuous)
            cond_grad = cond_grad_fn(x, t_input)
            sigma_t = noise_schedule.marginal_std(t_continuous)
            noise = noise_pred_fn(x, t_continuous)
            return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad
        elif guidance_type == "classifier-free":
            if guidance_scale == 1.0 or unconditional_condition is None:
                return noise_pred_fn(x, t_continuous, cond=condition)
            else:
                x_in = torch.cat([x] * 2)
                t_in = torch.cat([t_continuous] * 2)
                if isinstance(condition, torch.Tensor) and ( isinstance(unconditional_condition, torch.Tensor) or unconditional_condition is None ):
                    # Tensor conditions: batch [uncond, cond] and split after.
                    c_in = torch.cat([unconditional_condition, condition])
                else:
                    # NOTE(review): for non-tensor conditions the pair is passed as
                    # [condition, unconditional_condition] -- the reverse of the
                    # tensor branch.  The commented-out line below suggests this is
                    # intentional (the downstream closure appears to unpack
                    # (embeddings, added kwargs) rather than uncond/cond); confirm
                    # against the sampler before changing.
                    c_in = [condition, unconditional_condition]
                # c_in = torch.cat([unconditional_condition, condition])
                noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2)
                return noise_uncond + guidance_scale * (noise - noise_uncond)

    return model_fn
|
| 404 |
+
|
| 405 |
+
|
| 406 |
+
def weighted_cumsumexp_trapezoid(a, x, b, cumsum=True):
    """Trapezoidal approximation of the running integral of b * e^a over x.

    Inputs `a`, `x` (and `b`, if given) share shape (N+1, ...); `b=None` is
    treated as an all-ones weight.  With `cumsum=True` the result has shape
    (N+1, ...) with y_0 = 0 and
    y_n = sum_{i=1}^{n} 0.5 * (x_i - x_{i-1}) * (b_i e^{a_i} + b_{i-1} e^{a_{i-1}});
    with `cumsum=False` only the total integral (shape (1, ...)) is returned.
    """
    assert x.shape[0] == a.shape[0] and x.ndim == a.ndim
    if b is not None:
        assert a.shape[0] == b.shape[0] and a.ndim == b.ndim

    # Log-sum-exp trick: factor out the peak exponent for numerical stability.
    a_peak = np.amax(a, axis=0, keepdims=True)
    if b is None:
        integrand = np.exp(a - a_peak)
    else:
        integrand = np.asarray(b) * np.exp(a - a_peak)

    # Per-interval trapezoid areas (still scaled by e^{-a_peak}).
    segments = 0.5 * (x[1:] - x[:-1]) * (integrand[1:] + integrand[:-1])
    if not cumsum:
        return np.sum(segments, axis=0) * np.exp(a_peak)
    running = np.cumsum(segments, axis=0) * np.exp(a_peak)
    # Prepend the zero initial value so the output aligns with the inputs.
    return np.concatenate([np.zeros_like(running[[0]]), running], axis=0)
|
| 431 |
+
|
| 432 |
+
|
| 433 |
+
def weighted_cumsumexp_trapezoid_torch(a, x, b, cumsum=True):
    """Torch twin of :func:`weighted_cumsumexp_trapezoid`: ∫ b * e^a dx.

    All tensors share shape (N+1, ...). Returns the running integral (with a
    leading zero row) when ``cumsum=True``, otherwise just the total
    (shape (1, ...)). Numerically stabilized by factoring out max(a).
    """
    assert x.shape[0] == a.shape[0] and x.ndim == a.ndim
    if b is not None:
        assert a.shape[0] == b.shape[0] and a.ndim == b.ndim

    # Peak of `a` along the grid axis, kept for re-scaling at the end.
    peak = torch.amax(a, dim=0, keepdim=True)

    weighted = torch.exp(a - peak)
    if b is not None:
        weighted = b * weighted

    # Trapezoid rule over each grid interval.
    segments = 0.5 * (x[1:] - x[:-1]) * (weighted[1:] + weighted[:-1])
    if not cumsum:
        return torch.sum(segments, dim=0) * torch.exp(peak)
    running = torch.cumsum(segments, dim=0) * torch.exp(peak)
    # Leading zero row so the result is aligned with the input grid.
    return torch.cat([torch.zeros_like(running[[0]]), running], dim=0)
|
| 451 |
+
|
| 452 |
+
|
| 453 |
+
def index_list(lst, index):
    """Return the elements of *lst* at the positions listed in *index*.

    Args:
        lst: Any indexable sequence.
        index: Iterable of integer positions (negative indices allowed,
            following normal Python indexing).
    Returns:
        A new list ``[lst[i] for i in index]``; *lst* is not modified.
    """
    # Comprehension replaces the original manual append loop (same behavior).
    return [lst[i] for i in index]
|
| 458 |
+
|
| 459 |
+
|
| 460 |
+
class DPM_Solver_v3:
    """Multistep predictor–corrector diffusion ODE sampler driven by
    precomputed empirical-model-statistics (EMS) coefficients l, s, b.

    The statistics are loaded from ``statistics_dir`` (``l.npz`` / ``sb.npz``)
    on a dense logSNR grid; from them the integrated quantities L, S, I, B, C
    are built once with the trapezoid helpers and then indexed per step.
    """

    def __init__(
        self,
        statistics_dir,
        noise_schedule,
        steps=10,
        t_start=None,
        t_end=None,
        skip_type="time_uniform",
        degenerated=False,
        device="cuda",
    ):
        # model is injected later by sample(); None until then.
        self.device = device
        self.model = None
        self.noise_schedule = noise_schedule
        self.steps = steps
        # Default sampling interval: [1/N, T] for discrete-time DPMs.
        t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        assert (
            t_0 > 0 and t_T > 0
        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"

        # Empirical statistics on the dense grid; `degenerated=True` reduces
        # the solver to the plain (l=1, s=b=0) parameterization.
        l = np.load(os.path.join(statistics_dir, "l.npz"))["l"]
        sb = np.load(os.path.join(statistics_dir, "sb.npz"))
        s, b = sb["s"], sb["b"]
        if degenerated:
            l = np.ones_like(l)
            s = np.zeros_like(s)
            b = np.zeros_like(b)
        self.statistics_steps = l.shape[0] - 1
        # Dense logSNR grid matching the statistics, shaped (N+1, 1, 1, 1)
        # so it broadcasts against image-like tensors.
        ts = noise_schedule.marginal_lambda(
            self.get_time_steps("logSNR", t_T, t_0, self.statistics_steps, "cpu")
        ).numpy()[:, None, None, None]
        # NOTE(review): `.cuda()` here (and below) ignores the `device` arg —
        # this class is CUDA-only as written; confirm before running on CPU.
        self.ts = torch.from_numpy(ts).cuda()
        self.lambda_T = self.ts[0].cpu().item()
        self.lambda_0 = self.ts[-1].cpu().item()
        z = np.zeros_like(l)
        o = np.ones_like(l)
        # Integrated coefficients on the dense grid:
        # L = ∫ l dλ, S = ∫ s dλ, I = ∫ e^{L+S} dλ, B = ∫ b e^{-S} dλ,
        # C = ∫ B e^{L+S} dλ.
        L = weighted_cumsumexp_trapezoid(z, ts, l)
        S = weighted_cumsumexp_trapezoid(z, ts, s)

        I = weighted_cumsumexp_trapezoid(L + S, ts, o)
        B = weighted_cumsumexp_trapezoid(-S, ts, b)
        C = weighted_cumsumexp_trapezoid(L + S, ts, B)
        self.l = torch.from_numpy(l).cuda()
        self.s = torch.from_numpy(s).cuda()
        self.b = torch.from_numpy(b).cuda()
        self.L = torch.from_numpy(L).cuda()
        self.S = torch.from_numpy(S).cuda()
        self.I = torch.from_numpy(I).cuda()
        self.B = torch.from_numpy(B).cuda()
        self.C = torch.from_numpy(C).cuda()

        # precompute timesteps
        if skip_type == "logSNR" or skip_type == "time_uniform" or skip_type == "time_quadratic" or skip_type == "customed_time_karras":
            self.timesteps = self.get_time_steps(skip_type, t_T=t_T, t_0=t_0, N=steps, device=device)
            # Snap each timestep to the nearest dense-grid index, then map the
            # snapped indexes back to timesteps so both stay consistent.
            self.indexes = self.convert_to_indexes(self.timesteps)
            self.timesteps = self.convert_to_timesteps(self.indexes, device)
        elif skip_type == "edm":
            self.indexes, self.timesteps = self.get_timesteps_edm(N=steps, device=device)
            self.timesteps = self.convert_to_timesteps(self.indexes, device)
        else:
            raise ValueError(f"Unsupported timestep strategy {skip_type}")

        print("Indexes", self.indexes)
        print("Time steps", self.timesteps)
        print("LogSNR steps", self.noise_schedule.marginal_lambda(self.timesteps))

        # store high-order exponential coefficients (lazy)
        self.exp_coeffs = {}

    def noise_prediction_fn(self, x, t):
        """
        Return the noise prediction model.
        """
        return self.model(x, t)

    def convert_to_indexes(self, timesteps):
        """Map timesteps to their nearest indexes on the dense logSNR grid."""
        logSNR_steps = self.noise_schedule.marginal_lambda(timesteps)
        indexes = list(
            (self.statistics_steps * (logSNR_steps - self.lambda_T) / (self.lambda_0 - self.lambda_T))
            .round()
            .cpu()
            .numpy()
            .astype(np.int64)
        )
        return indexes

    def convert_to_timesteps(self, indexes, device):
        """Inverse of convert_to_indexes: dense-grid indexes -> timesteps."""
        logSNR_steps = (
            self.lambda_T + (self.lambda_0 - self.lambda_T) * torch.Tensor(indexes).to(device) / self.statistics_steps
        )
        return self.noise_schedule.inverse_lambda(logSNR_steps)

    def append_zero(self, x):
        # Append a terminal sigma=0 entry (Karras-style schedule convention).
        return torch.cat([x, x.new_zeros([1])])

    def get_sigmas_karras(self, n, sigma_min, sigma_max, rho=7., device='cpu', need_append_zero=True):
        """Constructs the noise schedule of Karras et al. (2022)."""
        ramp = torch.linspace(0, 1, n)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
        return self.append_zero(sigmas).to(device) if need_append_zero else sigmas.to(device)

    def sigma_to_t(self, sigma, quantize=None):
        """Interpolated inverse of the discrete sigma schedule (k-diffusion style).

        NOTE(review): `quantize` is immediately overwritten with False, so the
        argmin branch is dead code as written.
        """
        quantize = False
        log_sigma = sigma.log()
        dists = log_sigma - self.noise_schedule.log_sigmas[:, None]
        if quantize:
            return dists.abs().argmin(dim=0).view(sigma.shape)
        # Bracket log_sigma between two schedule entries and interpolate.
        low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.noise_schedule.log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1
        low, high = self.noise_schedule.log_sigmas[low_idx], self.noise_schedule.log_sigmas[high_idx]
        w = (low - log_sigma) / (low - high)
        w = w.clamp(0, 1)
        t = (1 - w) * low_idx + w * high_idx
        return t.view(sigma.shape)

    def get_time_steps(self, skip_type, t_T, t_0, N, device):
        """Compute the intermediate time steps for sampling.

        Args:
            skip_type: A `str`. The type for the spacing of the time steps. We support three types:
                - 'logSNR': uniform logSNR for the time steps.
                - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
                - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
            t_T: A `float`. The starting time of the sampling (default is T).
            t_0: A `float`. The ending time of the sampling (default is epsilon).
            N: A `int`. The total number of the spacing of the time steps.
            device: A torch device.
        Returns:
            A pytorch tensor of the time steps, with the shape (N + 1,).
        """
        if skip_type == "logSNR":
            lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
            lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
            logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
            return self.noise_schedule.inverse_lambda(logSNR_steps)
        elif skip_type == "time_uniform":
            return torch.linspace(t_T, t_0, N + 1).to(device)
        elif skip_type == "time_quadratic":
            t_order = 2
            t = torch.linspace(t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1).pow(t_order).to(device)
            return t
        elif skip_type == "customed_time_karras":
            # Hand-tuned hybrid schedules for specific step budgets:
            # build a Karras sigma ladder, truncate it, then re-space the
            # truncated range with a second (rho=1.2) power schedule.
            # NOTE(review): only N in {8, 5, 6} is handled; other N would
            # raise NameError on real_ct — confirm intended budgets.
            sigma_T = self.noise_schedule.sigmas[-1].cpu().item()
            sigma_0 = self.noise_schedule.sigmas[0].cpu().item()
            if N == 8:
                sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=7.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[9])
                ct = self.get_sigmas_karras(9, ct_end.item(), ct_start.item(),rho=1.2, device='cpu',need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 5:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(),rho=1.2, device='cpu',need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            elif N == 6:
                sigmas = self.sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(),rho=1.2, device='cpu',need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            # Timesteps are normalized to [0, 1] (divided by 999 above).
            none_k_ct = torch.from_numpy(np.array(real_ct)).to(device)
            return none_k_ct#real_ct
        else:
            raise ValueError(
                "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type)
            )

    def get_timesteps_edm(self, N, device):
        """Constructs the noise schedule of Karras et al. (2022)."""

        rho = 7.0  # 7.0 is the value used in the paper

        # sigma = exp(-lambda): derive the EDM sigma range from the solver's
        # logSNR endpoints so the schedule matches the statistics grid.
        sigma_min: float = np.exp(-self.lambda_0)
        sigma_max: float = np.exp(-self.lambda_T)
        ramp = np.linspace(0, 1, N + 1)
        min_inv_rho = sigma_min ** (1 / rho)
        max_inv_rho = sigma_max ** (1 / rho)
        sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
        lambdas = torch.Tensor(-np.log(sigmas)).to(device)
        timesteps = self.noise_schedule.inverse_lambda(lambdas)

        # Snap each lambda to its nearest dense-grid index.
        indexes = list(
            (self.statistics_steps * (lambdas - self.lambda_T) / (self.lambda_0 - self.lambda_T))
            .round()
            .cpu()
            .numpy()
            .astype(np.int64)
        )
        return indexes, timesteps

    def get_g(self, f_t, i_s, i_t):
        # Change of variables from f to the solver's "g" function, anchored at
        # grid index i_s (see the DPM-Solver-v3 formulation).
        return torch.exp(self.S[i_s] - self.S[i_t]) * f_t - torch.exp(self.S[i_s]) * (self.B[i_t] - self.B[i_s])

    def compute_exponential_coefficients_high_order(self, i_s, i_t, order=2):
        """Lazily compute and cache ∫ e^{(L+S)-(L+S)_s} (t - t_s)^n / n! dt
        between grid indexes i_s and i_t (used by order >= 2 updates)."""
        key = (i_s, i_t, order)
        if key in self.exp_coeffs.keys():
            coeffs = self.exp_coeffs[key]
        else:
            n = order - 1
            a = self.L[i_s : i_t + 1] + self.S[i_s : i_t + 1] - self.L[i_s] - self.S[i_s]
            x = self.ts[i_s : i_t + 1]
            b = (self.ts[i_s : i_t + 1] - self.ts[i_s]) ** n / math.factorial(n)
            coeffs = weighted_cumsumexp_trapezoid_torch(a, x, b, cumsum=False)
            self.exp_coeffs[key] = coeffs
        return coeffs

    def compute_high_order_derivatives(self, n, lambda_0n, g_0n, pseudo=False):
        # return g^(1), ..., g^(n)
        if pseudo:
            # Divided-difference (Newton) estimates of the derivatives.
            D = [[] for _ in range(n + 1)]
            D[0] = g_0n
            for i in range(1, n + 1):
                for j in range(n - i + 1):
                    D[i].append((D[i - 1][j] - D[i - 1][j + 1]) / (lambda_0n[j] - lambda_0n[i + j]))

            return [D[i][0] * math.factorial(i) for i in range(1, n + 1)]
        else:
            # Solve the Vandermonde-like linear system for exact Taylor
            # coefficients through the n+1 cached (lambda, g) pairs.
            R = []
            for i in range(1, n + 1):
                R.append(torch.pow(lambda_0n[1:] - lambda_0n[0], i))
            R = torch.stack(R).t()
            B = (torch.stack(g_0n[1:]) - g_0n[0]).reshape(n, -1)
            shape = g_0n[0].shape
            solution = torch.linalg.inv(R) @ B
            solution = solution.reshape([n] + list(shape))
            return [solution[i - 1] * math.factorial(i) for i in range(1, n + 1)]

    def multistep_predictor_update(self, x_lst, eps_lst, time_lst, index_lst, t, i_t, order=1, pseudo=False):
        """Advance x from the most recent cached state to time t (index i_t)
        using an order-`order` multistep update built from the cached history."""
        # x_lst: [..., x_s]
        # eps_lst: [..., eps_s]
        # time_lst: [..., time_s]
        ns = self.noise_schedule
        n = order - 1
        # Most recent n+1 cache entries, newest first.
        indexes = [-i - 1 for i in range(n + 1)]
        x_0n = index_list(x_lst, indexes)
        eps_0n = index_list(eps_lst, indexes)
        time_0n = torch.FloatTensor(index_list(time_lst, indexes)).cuda()
        index_0n = index_list(index_lst, indexes)
        lambda_0n = ns.marginal_lambda(time_0n)
        alpha_0n = ns.marginal_alpha(time_0n)
        sigma_0n = ns.marginal_std(time_0n)

        alpha_s, alpha_t = alpha_0n[0], ns.marginal_alpha(t)
        i_s = index_0n[0]
        x_s = x_0n[0]
        # Transform each cached eps prediction into the g-space value.
        g_0n = []
        for i in range(n + 1):
            f_i = (sigma_0n[i] * eps_0n[i] - self.l[index_0n[i]] * x_0n[i]) / alpha_0n[i]
            g_i = self.get_g(f_i, index_0n[0], index_0n[i])
            g_0n.append(g_i)
        g_0 = g_0n[0]
        # First-order (exact linear part) of the update.
        x_t = (
            alpha_t / alpha_s * torch.exp(self.L[i_s] - self.L[i_t]) * x_s
            - alpha_t * torch.exp(-self.L[i_t] - self.S[i_s]) * (self.I[i_t] - self.I[i_s]) * g_0
            - alpha_t
            * torch.exp(-self.L[i_t])
            * (self.C[i_t] - self.C[i_s] - self.B[i_s] * (self.I[i_t] - self.I[i_s]))
        )
        if order > 1:
            # Taylor corrections from estimated derivatives of g.
            g_d = self.compute_high_order_derivatives(n, lambda_0n, g_0n, pseudo=pseudo)
            for i in range(order - 1):
                x_t = (
                    x_t
                    - alpha_t
                    * torch.exp(self.L[i_s] - self.L[i_t])
                    * self.compute_exponential_coefficients_high_order(i_s, i_t, order=i + 2)
                    * g_d[i]
                )
        return x_t

    def multistep_corrector_update(self, x_lst, eps_lst, time_lst, index_lst, order=1, pseudo=False):
        """Recompute x at the latest time using the just-obtained model output
        there (implicit corrector); same formula as the predictor but anchored
        at the second-to-last cache entry."""
        # x_lst: [..., x_s, x_t]
        # eps_lst: [..., eps_s, eps_t]
        # lambda_lst: [..., lambda_s, lambda_t]
        ns = self.noise_schedule
        n = order - 1
        indexes = [-i - 1 for i in range(n + 1)]
        # Swap so position 0 is s (second-to-last) and position 1 is t (last).
        indexes[0] = -2
        indexes[1] = -1
        x_0n = index_list(x_lst, indexes)
        eps_0n = index_list(eps_lst, indexes)
        time_0n = torch.FloatTensor(index_list(time_lst, indexes)).cuda()
        index_0n = index_list(index_lst, indexes)
        lambda_0n = ns.marginal_lambda(time_0n)
        alpha_0n = ns.marginal_alpha(time_0n)
        sigma_0n = ns.marginal_std(time_0n)

        alpha_s, alpha_t = alpha_0n[0], alpha_0n[1]
        i_s, i_t = index_0n[0], index_0n[1]
        x_s = x_0n[0]
        g_0n = []
        for i in range(n + 1):
            f_i = (sigma_0n[i] * eps_0n[i] - self.l[index_0n[i]] * x_0n[i]) / alpha_0n[i]
            g_i = self.get_g(f_i, index_0n[0], index_0n[i])
            g_0n.append(g_i)
        g_0 = g_0n[0]
        x_t_new = (
            alpha_t / alpha_s * torch.exp(self.L[i_s] - self.L[i_t]) * x_s
            - alpha_t * torch.exp(-self.L[i_t] - self.S[i_s]) * (self.I[i_t] - self.I[i_s]) * g_0
            - alpha_t
            * torch.exp(-self.L[i_t])
            * (self.C[i_t] - self.C[i_s] - self.B[i_s] * (self.I[i_t] - self.I[i_s]))
        )
        if order > 1:
            g_d = self.compute_high_order_derivatives(n, lambda_0n, g_0n, pseudo=pseudo)
            for i in range(order - 1):
                x_t_new = (
                    x_t_new
                    - alpha_t
                    * torch.exp(self.L[i_s] - self.L[i_t])
                    * self.compute_exponential_coefficients_high_order(i_s, i_t, order=i + 2)
                    * g_d[i]
                )
        return x_t_new

    def sample(
        self,
        x,
        model_fn,
        order,
        p_pseudo,
        use_corrector,
        c_pseudo,
        lower_order_final,
        start_free_u_step=None,
        free_u_apply_callback=None,
        free_u_stop_callback=None,
        half=False,
        return_intermediate=False,
    ):
        """Run the full predictor(-corrector) sampling loop from noise x.

        Args:
            x: Initial latent at time t_T.
            model_fn: Noise-prediction model, called as model_fn(x, t_batch).
            order: Maximum multistep order for the predictor.
            p_pseudo / c_pseudo: Use pseudo (divided-difference) derivatives
                for predictor / corrector respectively.
            use_corrector: Apply the implicit corrector after each model call.
            lower_order_final: Reduce order near the end (stability).
            start_free_u_step / free_u_*_callback: Hooks to toggle FreeU
                partway through sampling.
            half: If True, skip the corrector once t drops below 0.5.
            return_intermediate: Also return the list of cached latents.
        """
        # Broadcast the scalar timestep to the batch dimension for the model.
        self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
        steps = self.steps
        cached_x = []
        cached_model_output = []
        cached_time = []
        cached_index = []
        indexes, timesteps = self.indexes, self.timesteps
        step_p_order = 0
        if free_u_stop_callback is not None:
            free_u_stop_callback()
        for step in range(1, steps + 1):
            if start_free_u_step is not None and step == start_free_u_step and free_u_apply_callback is not None:
                free_u_apply_callback()
            cached_x.append(x)
            cached_model_output.append(self.noise_prediction_fn(x, timesteps[step - 1]))
            cached_index.append(indexes[step - 1]) if False else None  # NOTE(review): see ordering below
            cached_time.append(timesteps[step - 1])
            cached_index.append(indexes[step - 1])
            if use_corrector and (timesteps[step - 1] > 0.5 or not half):
                step_c_order = step_p_order + c_pseudo
                if step_c_order > 1:
                    x_new = self.multistep_corrector_update(
                        cached_x, cached_model_output, cached_time, cached_index, order=step_c_order, pseudo=c_pseudo
                    )
                    # Keep the l-weighted residual N = sigma*eps - l*x fixed
                    # while replacing x with the corrected value, then back
                    # out the consistent eps for the cache.
                    sigma_t = self.noise_schedule.marginal_std(cached_time[-1])
                    l_t = self.l[cached_index[-1]]
                    N_old = sigma_t * cached_model_output[-1] - l_t * cached_x[-1]
                    cached_x[-1] = x_new
                    cached_model_output[-1] = (N_old + l_t * cached_x[-1]) / sigma_t
            # Warm up the multistep order, then optionally taper it at the end.
            if step < order:
                step_p_order = step
            else:
                step_p_order = order
            if lower_order_final:
                step_p_order = min(step_p_order, steps + 1 - step)
            t = timesteps[step]
            i_t = indexes[step]

            x = self.multistep_predictor_update(
                cached_x, cached_model_output, cached_time, cached_index, t, i_t, order=step_p_order, pseudo=p_pseudo
            )

        if return_intermediate:
            return x, cached_x
        else:
            return x
|
| 841 |
+
|
| 842 |
+
|
| 843 |
+
#############################################################
|
| 844 |
+
# other utility functions
|
| 845 |
+
#############################################################
|
| 846 |
+
|
| 847 |
+
|
| 848 |
+
def interpolate_fn(x, xp, yp):
    """
    A piecewise linear function y = f(x), using xp and yp as keypoints.
    We implement f(x) in a differentiable way (i.e. applicable for autograd).
    The function f(x) is well-defined for all x-axis. (For x beyond the bounds of xp, we use the outmost points of xp to define the linear function.)

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    N, K = x.shape[0], xp.shape[1]
    # Insert each query x among the keypoints and sort; the position of x in
    # the sorted row tells us which keypoint interval it falls into.
    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
    # x was at position 0 before sorting, so argmin of the permutation gives
    # the sorted position of the query itself.
    x_idx = torch.argmin(x_indices, dim=2)
    cand_start_idx = x_idx - 1
    # Clamp out-of-range queries to the outermost interval (positions in the
    # sorted row that includes the query, hence the 1 / K-2 endpoints).
    start_idx = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(1, device=x.device),
        torch.where(
            torch.eq(x_idx, K),
            torch.tensor(K - 2, device=x.device),
            cand_start_idx,
        ),
    )
    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
    # Same bracketing expressed in original (unsorted) keypoint indices, used
    # to pick the matching y values.
    start_idx2 = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(0, device=x.device),
        torch.where(
            torch.eq(x_idx, K),
            torch.tensor(K - 2, device=x.device),
            cand_start_idx,
        ),
    )
    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
    # Linear interpolation within the bracketed interval.
    cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
    return cand
|
| 892 |
+
|
| 893 |
+
|
| 894 |
+
def expand_dims(v, dims):
    """
    Expand the tensor `v` to the dim `dims`.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dim`: a `int`.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    # Append dims-1 singleton axes after the existing ones via advanced indexing.
    trailing_axes = (None,) * (dims - 1)
    return v[(Ellipsis,) + trailing_axes]
|
free_lunch_utils.py
ADDED
|
@@ -0,0 +1,303 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.fft as fft
|
| 3 |
+
from diffusers.utils import is_torch_version
|
| 4 |
+
from typing import Any, Dict, List, Optional, Tuple, Union
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def isinstance_str(x: object, cls_name: str):
    """
    Checks whether x has any class *named* cls_name in its ancestry.
    Doesn't require access to the class's implementation.

    Useful for patching!
    """
    # Walk the MRO and compare class names only (no import of the class needed).
    return any(ancestor.__name__ == cls_name for ancestor in x.__class__.__mro__)
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def Fourier_filter(x, threshold, scale):
    """Rescale the low-frequency band of x in the 2-D Fourier domain (FreeU).

    Args:
        x: Tensor of shape (B, C, H, W); any float dtype (computed in float32).
        threshold: Half-width (in frequency bins) of the centered low-frequency
            square to rescale.
        scale: Multiplier applied to that low-frequency square.
    Returns:
        Filtered tensor with the same shape, dtype and device as `x`.
    """
    dtype = x.dtype
    # Work in float32: torch.fft support/precision for half precision is limited.
    x = x.type(torch.float32)
    # FFT over the spatial dims, shifted so DC sits at the center.
    x_freq = fft.fftn(x, dim=(-2, -1))
    x_freq = fft.fftshift(x_freq, dim=(-2, -1))

    B, C, H, W = x_freq.shape
    # Fix: build the mask on x's device. The original hard-coded .cuda(),
    # which crashed on CPU-only runs and could hit the wrong GPU.
    mask = torch.ones((B, C, H, W), device=x.device)

    crow, ccol = H // 2, W // 2
    # Centered square of side 2*threshold gets the scale factor.
    mask[..., crow - threshold:crow + threshold, ccol - threshold:ccol + threshold] = scale
    x_freq = x_freq * mask

    # IFFT back to the spatial domain; imaginary part is numerical noise.
    x_freq = fft.ifftshift(x_freq, dim=(-2, -1))
    x_filtered = fft.ifftn(x_freq, dim=(-2, -1)).real

    x_filtered = x_filtered.type(dtype)
    return x_filtered
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
def register_upblock2d(model):
    """Monkey-patch every UpBlock2D in model.unet with a plain (non-FreeU)
    forward; used to restore the stock behavior after register_free_upblock2d.

    Args:
        model: A pipeline-like object exposing `model.unet.up_blocks`
            (diffusers UNet2DConditionModel layout).
    """
    def up_forward(self):
        # Closure capturing the block instance; mirrors diffusers'
        # UpBlock2D.forward implementation.
        def forward(hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
            for resnet in self.resnets:
                # pop res hidden states
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
                #print(f"in upblock2d, hidden states shape: {hidden_states.shape}")
                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs)

                        return custom_forward

                    # use_reentrant=False requires torch >= 1.11.
                    if is_torch_version(">=", "1.11.0"):
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
                        )
                    else:
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb
                        )
                else:
                    hidden_states = resnet(hidden_states, temb)

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    # Rebind forward on every plain UpBlock2D (matched by class name so no
    # direct diffusers import of the class is needed).
    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "UpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)
|
| 84 |
+
|
| 85 |
+
|
| 86 |
+
def register_free_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2):
    """Monkey-patch every UpBlock2D in model.unet with a FreeU-enabled forward.

    FreeU amplifies backbone features (b1/b2) and damps skip-connection
    low frequencies (s1/s2) in the first two decoder stages.

    Args:
        model: Pipeline-like object exposing `model.unet.up_blocks`.
        b1, b2: Backbone feature multipliers for stage 1 / stage 2.
        s1, s2: Fourier low-frequency scales for the matching skip features.
    """
    def up_forward(self):
        def forward(hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
            for resnet in self.resnets:
                # pop res hidden states
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
                #print(f"in free upblock2d, hidden states shape: {hidden_states.shape}")

                # --------------- FreeU code -----------------------
                # Only operate on the first two stages
                # NOTE(review): stages are detected by channel count (1280/640),
                # which assumes the SD-1.x/2.x UNet channel layout. The scaling
                # mutates hidden_states in place on the first half of channels.
                if hidden_states.shape[1] == 1280:
                    hidden_states[:,:640] = hidden_states[:,:640] * self.b1
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
                if hidden_states.shape[1] == 640:
                    hidden_states[:,:320] = hidden_states[:,:320] * self.b2
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s2)
                # ---------------------------------------------------------

                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module):
                        def custom_forward(*inputs):
                            return module(*inputs)

                        return custom_forward

                    if is_torch_version(">=", "1.11.0"):
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb, use_reentrant=False
                        )
                    else:
                        hidden_states = torch.utils.checkpoint.checkpoint(
                            create_custom_forward(resnet), hidden_states, temb
                        )
                else:
                    hidden_states = resnet(hidden_states, temb)

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    # Attach the FreeU hyperparameters directly on each patched block so the
    # closure can read them via `self`.
    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "UpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)
            setattr(upsample_block, 'b1', b1)
            setattr(upsample_block, 'b2', b2)
            setattr(upsample_block, 's1', s1)
            setattr(upsample_block, 's2', s2)
|
| 141 |
+
|
| 142 |
+
|
| 143 |
+
def register_crossattn_upblock2d(model):
    """Monkey-patch every CrossAttnUpBlock2D in model.unet with a plain
    (non-FreeU) forward; restores stock behavior after the FreeU variant.

    Args:
        model: Pipeline-like object exposing `model.unet.up_blocks`
            (diffusers UNet2DConditionModel layout).
    """
    def up_forward(self):
        # Mirrors diffusers' CrossAttnUpBlock2D.forward signature.
        def forward(
            hidden_states: torch.FloatTensor,
            res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
            temb: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            upsample_size: Optional[int] = None,
            attention_mask: Optional[torch.FloatTensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
        ):
            for resnet, attn in zip(self.resnets, self.attentions):
                # pop res hidden states
                #print(f"in crossatten upblock2d, hidden states shape: {hidden_states.shape}")
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]
                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    # use_reentrant=False requires torch >= 1.11.
                    ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(resnet),
                        hidden_states,
                        temb,
                        **ckpt_kwargs,
                    )
                    # Positional args follow Transformer2DModel.forward's order.
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(attn, return_dict=False),
                        hidden_states,
                        encoder_hidden_states,
                        None,  # timestep
                        None,  # class_labels
                        cross_attention_kwargs,
                        attention_mask,
                        encoder_attention_mask,
                        **ckpt_kwargs,
                    )[0]
                else:
                    hidden_states = resnet(hidden_states, temb)
                    hidden_states = attn(
                        hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        cross_attention_kwargs=cross_attention_kwargs,
                        attention_mask=attention_mask,
                        encoder_attention_mask=encoder_attention_mask,
                        return_dict=False,
                    )[0]

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    # Rebind forward on every CrossAttnUpBlock2D (matched by class name).
    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "CrossAttnUpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
def register_free_crossattn_upblock2d(model, b1=1.2, b2=1.4, s1=0.9, s2=0.2):
    """Monkey-patch FreeU onto every CrossAttnUpBlock2D of ``model.unet``.

    FreeU rescales part of the backbone features by ``b1``/``b2`` and
    low-pass filters the skip connections by ``s1``/``s2`` (via
    ``Fourier_filter``). Passing all-1.0 values restores stock behavior.
    """
    def up_forward(self):
        def forward(
            hidden_states: torch.FloatTensor,
            res_hidden_states_tuple: Tuple[torch.FloatTensor, ...],
            temb: Optional[torch.FloatTensor] = None,
            encoder_hidden_states: Optional[torch.FloatTensor] = None,
            cross_attention_kwargs: Optional[Dict[str, Any]] = None,
            upsample_size: Optional[int] = None,
            attention_mask: Optional[torch.FloatTensor] = None,
            encoder_attention_mask: Optional[torch.FloatTensor] = None,
        ):
            for resnet, attn in zip(self.resnets, self.attentions):
                # pop res hidden states
                #print(f"in free crossatten upblock2d, hidden states shape: {hidden_states.shape}")
                res_hidden_states = res_hidden_states_tuple[-1]
                res_hidden_states_tuple = res_hidden_states_tuple[:-1]

                # --------------- FreeU code -----------------------
                # Only operate on the first two stages
                # NOTE: scales the first half of the channels in place and
                # filters the matching skip tensor; the channel-count check
                # selects the stage.
                if hidden_states.shape[1] == 1280:
                    hidden_states[:,:640] = hidden_states[:,:640] * self.b1
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s1)
                if hidden_states.shape[1] == 640:
                    hidden_states[:,:320] = hidden_states[:,:320] * self.b2
                    res_hidden_states = Fourier_filter(res_hidden_states, threshold=1, scale=self.s2)
                # ---------------------------------------------------------

                hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)

                if self.training and self.gradient_checkpointing:

                    def create_custom_forward(module, return_dict=None):
                        def custom_forward(*inputs):
                            if return_dict is not None:
                                return module(*inputs, return_dict=return_dict)
                            else:
                                return module(*inputs)

                        return custom_forward

                    # Non-reentrant checkpointing is only available on torch >= 1.11.
                    ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(resnet),
                        hidden_states,
                        temb,
                        **ckpt_kwargs,
                    )
                    hidden_states = torch.utils.checkpoint.checkpoint(
                        create_custom_forward(attn, return_dict=False),
                        hidden_states,
                        encoder_hidden_states,
                        None,  # timestep
                        None,  # class_labels
                        cross_attention_kwargs,
                        attention_mask,
                        encoder_attention_mask,
                        **ckpt_kwargs,
                    )[0]
                else:
                    hidden_states = resnet(hidden_states, temb)
                    # hidden_states = attn(
                    #     hidden_states,
                    #     encoder_hidden_states=encoder_hidden_states,
                    #     cross_attention_kwargs=cross_attention_kwargs,
                    #     encoder_attention_mask=encoder_attention_mask,
                    #     return_dict=False,
                    # )[0]
                    # NOTE(review): unlike the checkpointed branch above, this
                    # call drops attention_mask / encoder_attention_mask —
                    # confirm that masks are never used on this path.
                    hidden_states = attn(
                        hidden_states,
                        encoder_hidden_states=encoder_hidden_states,
                        cross_attention_kwargs=cross_attention_kwargs,
                    )[0]

            if self.upsamplers is not None:
                for upsampler in self.upsamplers:
                    hidden_states = upsampler(hidden_states, upsample_size)

            return hidden_states

        return forward

    # Patch every cross-attention up-block and stash the FreeU scales on it.
    for i, upsample_block in enumerate(model.unet.up_blocks):
        if isinstance_str(upsample_block, "CrossAttnUpBlock2D"):
            upsample_block.forward = up_forward(upsample_block)
            setattr(upsample_block, 'b1', b1)
            setattr(upsample_block, 'b2', b2)
            setattr(upsample_block, 's1', s1)
            setattr(upsample_block, 's2', s2)
|
requirements.txt
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
tqdm
|
| 2 |
+
einops
|
| 3 |
+
pytorch_lightning
|
| 4 |
+
accelerate
|
| 5 |
+
torchsde
|
| 6 |
+
pycocotools
|
| 7 |
+
diffusers
|
| 8 |
+
timm
|
| 9 |
+
transformers
|
| 10 |
+
opencv-python
|
| 11 |
+
omegaconf
|
sampler.py
ADDED
|
@@ -0,0 +1,315 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""SAMPLING ONLY."""
|
| 2 |
+
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
from dpm_solver_v3 import NoiseScheduleVP, model_wrapper, DPM_Solver_v3
|
| 6 |
+
from uni_pc import UniPC
|
| 7 |
+
from free_lunch_utils import register_free_upblock2d, register_free_crossattn_upblock2d
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
class DPMSolverv3Sampler:
    """Sample from a diffusers pipeline with the DPM-Solver-v3 ODE solver.

    The solver is parameterized by precomputed statistics (EMS) loaded from
    ``stats_dir``; the noise schedule is rebuilt from the pipeline scheduler's
    ``alphas_cumprod``.
    """

    def __init__(self, stats_dir, pipe, steps, guidance_scale, **kwargs):
        """Build the noise schedule and the DPM-Solver-v3 instance.

        Args:
            stats_dir: Directory holding the solver statistics (required).
            pipe: A diffusers pipeline exposing ``.unet`` and ``.scheduler``.
            steps: Number of sampling steps.
            guidance_scale: Classifier-free guidance scale.
        """
        super().__init__()
        self.model = pipe
        DTYPE = torch.float32  # torch.float16 works as well, but pictures seem to be a bit worse
        # NOTE(review): device is hard-coded to "cuda" here (pipe.device is
        # ignored) — confirm this is intentional.
        device = "cuda"
        noise_scheduler = pipe.scheduler
        alpha_schedule = noise_scheduler.alphas_cumprod.to(device=device, dtype=DTYPE)
        self.alphas_cumprod = alpha_schedule
        self.device = device
        self.guidance_scale = guidance_scale

        # Discrete-time VP noise schedule derived from the pipeline scheduler.
        self.ns = NoiseScheduleVP("discrete", alphas_cumprod=self.alphas_cumprod)

        assert stats_dir is not None, f"No statistics file found in {stats_dir}."
        print("Use statistics", stats_dir)
        self.dpm_solver_v3 = DPM_Solver_v3(
            statistics_dir=stats_dir,
            noise_schedule=self.ns,
            steps=steps,
            t_start=None,
            t_end=None,
            skip_type="customed_time_karras",
            degenerated=False,
            device=self.device,
        )
        self.steps = steps

    @torch.no_grad()
    def apply_free_unet(self):
        """Enable FreeU feature/skip rescaling on the UNet up-blocks."""
        register_free_upblock2d(self.model, b1=1.1, b2=1.1, s1=0.9, s2=0.2)
        register_free_crossattn_upblock2d(self.model, b1=1.1, b2=1.1, s1=0.9, s2=0.2)

    @torch.no_grad()
    def stop_free_unet(self):
        """Disable FreeU (all-unit scales restore stock UNet behavior)."""
        register_free_upblock2d(self.model, b1=1.0, b2=1.0, s1=1.0, s2=1.0)
        register_free_crossattn_upblock2d(self.model, b1=1.0, b2=1.0, s1=1.0, s2=1.0)

    @torch.no_grad()
    def sample(
        self,
        batch_size,
        shape,
        conditioning=None,
        x_T=None,
        unconditional_conditioning=None,
        use_corrector=False,
        half=False,
        start_free_u_step=None,
        # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
        **kwargs,
    ):
        """Run the solver and return ``(latents, None)``.

        Args:
            batch_size: Number of samples to draw.
            shape: Latent shape as ``(C, H, W)``.
            conditioning: Text/class conditioning (None for unconditional).
            x_T: Optional initial noise; sampled fresh when None.
            unconditional_conditioning: Negative conditioning for CFG.
            use_corrector: Whether the solver applies its corrector step.
            half: Passed through to the solver.
            start_free_u_step: Step index at which FreeU is switched on
                (FreeU callbacks are only wired when this is not None).
        """
        if conditioning is not None:
            # Sanity-check the conditioning batch size; mismatch is only warned.
            if isinstance(conditioning, dict):
                cbs = conditioning[list(conditioning.keys())[0]].shape[0]
                if cbs != batch_size:
                    print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
            else:
                if conditioning.shape[0] != batch_size:
                    print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")

        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)

        if x_T is None:
            img = torch.randn(size, device=self.device)
        else:
            img = x_T

        if conditioning is None:
            model_fn = model_wrapper(
                lambda x, t, c: self.model.unet(x, t, encoder_hidden_states=c).sample,
                self.ns,
                model_type="noise",
                guidance_type="uncond",
            )
            ORDER = 3
        else:
            model_fn = model_wrapper(
                lambda x, t, c: self.model.unet(x, t, encoder_hidden_states=c).sample,
                self.ns,
                model_type="noise",
                guidance_type="classifier-free",
                condition=conditioning,
                unconditional_condition=unconditional_conditioning,
                guidance_scale=self.guidance_scale,
            )
            # Lower solver order for guided sampling; order 2 only at 8 steps.
            if self.steps == 8:
                ORDER = 2
            else:
                ORDER = 1

        x = self.dpm_solver_v3.sample(
            img,
            model_fn,
            order=ORDER,
            p_pseudo=False,
            c_pseudo=True,
            lower_order_final=True,
            use_corrector=use_corrector,
            start_free_u_step=start_free_u_step,
            free_u_apply_callback=self.apply_free_unet if start_free_u_step is not None else None,
            free_u_stop_callback=self.stop_free_unet if start_free_u_step is not None else None,
            half=half,
        )

        return x.to(self.device), None
|
| 121 |
+
|
| 122 |
+
|
| 123 |
+
class UniPCSampler:
    """Sample from a diffusers pipeline with the UniPC predictor-corrector.

    The UNet call itself is produced by ``model_closure(pipe)``, so this class
    only wires the noise schedule, the solver, and optional FreeU / NPNet
    extras around it.
    """

    def __init__(self
        , pipe
        , model_closure
        , steps
        , guidance_scale,denoise_to_zero=False
        , need_fp16_discrete_method = False
        , ultilize_vae_in_fp16 = False
        , is_high_resoulution = True
        , skip_type="customed_time_karras"
        , force_not_use_afs=False
        , **kwargs):
        """Build the noise schedule and UniPC solver.

        Args:
            pipe: diffusers pipeline exposing ``.unet``, ``.scheduler``, ``.device``.
            model_closure: Callable mapping ``pipe`` to a ``(x, t, c)`` noise model.
            steps: Number of sampling steps.
            guidance_scale: Classifier-free guidance scale.
            denoise_to_zero: Passed through to the solver.
            need_fp16_discrete_method: Enables the fp16-specific discretization.
            ultilize_vae_in_fp16: Passed through to the solver (sic: "utilize").
            is_high_resoulution: Whether the target resolution is high (sic).
            skip_type: Timestep spacing strategy for the solver.
            force_not_use_afs: Disables AFS even for few-step high-res runs.
        """
        super().__init__()
        self.model = model_closure(pipe)
        self.pipe = pipe
        self.need_fp16_discrete_method = need_fp16_discrete_method
        DTYPE = self.pipe.unet.dtype  # torch.float16 works as well, but pictures seem to be a bit worse
        device = self.pipe.device
        noise_scheduler = pipe.scheduler
        alpha_schedule = noise_scheduler.alphas_cumprod.to(device=device, dtype=DTYPE)
        self.alphas_cumprod = alpha_schedule
        self.device = device
        self.guidance_scale = guidance_scale
        # AFS (saving the first model evaluation) only pays off for few-step,
        # high-resolution runs.
        self.use_afs = steps <= 8 and is_high_resoulution and not force_not_use_afs

        self.ns = NoiseScheduleVP("discrete", alphas_cumprod=self.alphas_cumprod)

        self.unipc_solver = UniPC(
            noise_schedule=self.ns,
            steps=steps,
            t_start=None,
            t_end=None,
            skip_type=skip_type,
            degenerated=False,
            use_afs=self.use_afs,
            device=self.device,
            denoise_to_zero=denoise_to_zero,
            need_fp16_discrete_method = self.need_fp16_discrete_method,
            ultilize_vae_in_fp16 = ultilize_vae_in_fp16,
            is_high_resoulution=is_high_resoulution,
        )
        self.steps = steps

    @torch.no_grad()
    def apply_free_unet(self):
        """Enable FreeU feature/skip rescaling on the UNet up-blocks."""
        register_free_upblock2d(self.pipe, b1=1.2, b2=1.2, s1=0.9, s2=0.2)
        register_free_crossattn_upblock2d(self.pipe, b1=1.2, b2=1.2, s1=0.9, s2=0.2)

    @torch.no_grad()
    def stop_free_unet(self):
        """Disable FreeU (all-unit scales restore stock UNet behavior)."""
        register_free_upblock2d(self.pipe, b1=1.0, b2=1.0, s1=1.0, s2=1.0)
        register_free_crossattn_upblock2d(self.pipe, b1=1.0, b2=1.0, s1=1.0, s2=1.0)

    @torch.no_grad()
    def sample(
        self,
        batch_size,
        shape,
        conditioning=None,
        x_T=None,
        unconditional_conditioning=None,
        use_corrector=False,
        half=False,
        start_free_u_step=None,
        xl_preprocess_closure=None,
        npnet=None,
        # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
        **kwargs,
    ):
        """Run UniPC and return ``(latents, solver_cache)``.

        When ``xl_preprocess_closure`` is given, prompts are first encoded into
        SDXL-style embeddings; when ``npnet`` is also given, its refined noise
        is handed to the solver as ``npnet_x`` (the raw ``img`` still seeds the
        trajectory).
        """
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        new_img = None
        if xl_preprocess_closure is not None:
            prompt_embeds, cond_kwargs = xl_preprocess_closure(pipe=self.pipe,prompts = conditioning, need_cfg=True, device=self.device,negative_prompts=unconditional_conditioning)
        if x_T is None:
            img = torch.randn(size, device=self.device)
        else:
            img = x_T
        if xl_preprocess_closure is not None and npnet is not None:
            c, _ = prompt_embeds
            c = c.unsqueeze(0)  # add dummy dimension for npnet
            new_img = npnet(img, c)

        if conditioning is None:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="uncond",
            )
            ORDER = 3
        else:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="classifier-free",
                condition=conditioning if xl_preprocess_closure is None else prompt_embeds,
                unconditional_condition=unconditional_conditioning if xl_preprocess_closure is None else cond_kwargs,
                guidance_scale=self.guidance_scale,
            )
            if self.steps >= 7:
                ORDER = 2
            else:
                ORDER = 1

        x, full_cache = self.unipc_solver.sample(
            x=img,
            model_fn=model_fn,
            order=ORDER,
            use_corrector=use_corrector,
            lower_order_final=True,
            start_free_u_step=start_free_u_step,
            free_u_apply_callback=self.apply_free_unet if start_free_u_step is not None else None,
            free_u_stop_callback=self.stop_free_unet if start_free_u_step is not None else None,
            # `new_img if new_img is not None else None` is just `new_img`.
            npnet_x=new_img,
            npnet_scale=self.guidance_scale if new_img is not None else None,
            half=half,
        )

        return x.to(self.device), full_cache

    @torch.no_grad()
    def sample_mix(
        self,
        batch_size,
        shape,
        conditioning=None,
        x_T=None,
        unconditional_conditioning=None,
        use_corrector=False,
        half=False,
        start_free_u_step=None,
        xl_preprocess_closure=None,
        npnet=None,
        # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
        **kwargs,
    ):
        """Mixed-precision variant of :meth:`sample`.

        Differences from ``sample``: the NPNet output (when available)
        *replaces* the initial noise, the order threshold is 8 steps and is
        additionally gated on ``need_fp16_discrete_method``, and the solver's
        ``sample_mix`` entry point is used.
        """
        # sampling
        C, H, W = shape
        size = (batch_size, C, H, W)
        if xl_preprocess_closure is not None:
            prompt_embeds, cond_kwargs = xl_preprocess_closure(pipe=self.pipe,prompts = conditioning, need_cfg=True, device=self.device,negative_prompts=unconditional_conditioning)
        if x_T is None:
            img = torch.randn(size, device=self.device)
        else:
            img = x_T
        if xl_preprocess_closure is not None and npnet is not None:
            c, _ = prompt_embeds
            c = c.unsqueeze(0)  # add dummy dimension for npnet
            img = npnet(img, c)

        if conditioning is None:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="uncond",
            )
            ORDER = 3
        else:
            model_fn = model_wrapper(
                lambda x, t, c: self.model(x, t, c),
                self.ns,
                model_type="noise",
                guidance_type="classifier-free",
                condition=conditioning if xl_preprocess_closure is None else prompt_embeds,
                unconditional_condition=unconditional_conditioning if xl_preprocess_closure is None else cond_kwargs,
                guidance_scale=self.guidance_scale,
            )
            if self.steps >= 8 and not self.need_fp16_discrete_method:
                ORDER = 2
            else:
                ORDER = 1

        x, full_cache = self.unipc_solver.sample_mix(
            x=img,
            model_fn=model_fn,
            order=ORDER,
            use_corrector=use_corrector,
            lower_order_final=True,
            start_free_u_step=start_free_u_step,
            free_u_apply_callback=self.apply_free_unet if start_free_u_step is not None else None,
            free_u_stop_callback=self.stop_free_unet if start_free_u_step is not None else None,
            half=half,
        )

        return x.to(self.device), full_cache
|
uni_pc.py
ADDED
|
@@ -0,0 +1,757 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dpm_solver_v3 import NoiseScheduleVP, model_wrapper
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import math
|
| 5 |
+
import numpy as np
|
| 6 |
+
import os
|
| 7 |
+
|
| 8 |
+
class UniPC:
|
| 9 |
+
    def __init__(
        self,
        noise_schedule,
        steps=10,
        t_start=None,
        t_end=None,
        skip_type="customed_time_karras",
        degenerated=False,
        use_afs = False,
        denoise_to_zero=False,
        need_fp16_discrete_method = False,
        ultilize_vae_in_fp16 = False,
        is_high_resoulution = True,
        device="cuda",
    ):
        """Configure the UniPC solver and precompute its time discretization.

        Args:
            noise_schedule: A NoiseScheduleVP-like object (provides ``total_N``,
                ``T``, ``sigmas``, ``marginal_lambda``, ``inverse_lambda``).
            steps: Number of solver steps (one extra is added when AFS is on).
            t_start / t_end: Optional sampling time range overrides; defaults
                are ``T`` and ``1 / total_N``.
            skip_type: Timestep spacing strategy; only "logSNR",
                "time_uniform", "time_quadratic" and "customed_time_karras"
                are accepted.
            degenerated: Unused here; kept for interface compatibility.
            use_afs: Analytical-first-step mode (skips the first model call).
            denoise_to_zero / need_fp16_discrete_method / ultilize_vae_in_fp16 /
                is_high_resoulution: Flags forwarded into the timestep
                computation and later sampling (note the original spellings).
            device: Torch device for the precomputed timesteps.
        """
        self.device = device
        self.model = None  # the model function is attached later, at sampling time
        self.noise_schedule = noise_schedule
        # AFS saves one model evaluation, so one extra step keeps the budget.
        self.steps = steps if not use_afs else steps + 1
        self.use_afs = use_afs
        self.ultilize_vae_in_fp16 = ultilize_vae_in_fp16
        self.need_fp16_discrete_method = need_fp16_discrete_method
        t_0 = 1.0 / self.noise_schedule.total_N if t_end is None else t_end
        t_T = self.noise_schedule.T if t_start is None else t_start
        self.is_high_resolution = is_high_resoulution
        assert (
            t_0 > 0 and t_T > 0
        ), "Time range needs to be greater than 0. For discrete-time DPMs, it needs to be in [1 / N, 1], where N is the length of betas array"


        # precompute timesteps
        if skip_type == "logSNR" or skip_type == "time_uniform" or skip_type == "time_quadratic" or skip_type == "customed_time_karras":
            self.timesteps = self.get_time_steps(skip_type
                , t_T=t_T
                , t_0=t_0
                , N=steps
                , device=device,denoise_to_zero=denoise_to_zero
                , is_high_resolution=self.is_high_resolution)
        else:
            raise ValueError(f"Unsupported timestep strategy {skip_type}")
        # NOTE(review): despite the names, these store the endpoint *timesteps*
        # (self.timesteps entries), not logSNR lambda values — confirm intended.
        self.lambda_T = self.timesteps[0].cpu().item()
        self.lambda_0 = self.timesteps[-1].cpu().item()

        # print("Time steps", self.timesteps)
        # print("LogSNR steps", self.noise_schedule.marginal_lambda(self.timesteps))

        # store high-order exponential coefficients (lazy)
        self.exp_coeffs = {}
|
| 57 |
+
|
| 58 |
+
    def noise_prediction_fn(self, x, t):
        """
        Return the noise prediction model.
        """
        # Thin wrapper around the model function attached before sampling
        # (``self.model`` is None until then).
        return self.model(x, t)
|
| 63 |
+
|
| 64 |
+
def append_zero(self, x):
|
| 65 |
+
return torch.cat([x, x.new_zeros([1])])
|
| 66 |
+
|
| 67 |
+
def get_sigmas_karras(self, n, sigma_min, sigma_max, rho=7., device='cpu', need_append_zero=True):
|
| 68 |
+
"""Constructs the noise schedule of Karras et al. (2022)."""
|
| 69 |
+
ramp = torch.linspace(0, 1, n)
|
| 70 |
+
min_inv_rho = sigma_min ** (1 / rho)
|
| 71 |
+
max_inv_rho = sigma_max ** (1 / rho)
|
| 72 |
+
sigmas = (max_inv_rho + ramp * (min_inv_rho - max_inv_rho)) ** rho
|
| 73 |
+
return self.append_zero(sigmas).to(device) if need_append_zero else sigmas.to(device)
|
| 74 |
+
|
| 75 |
+
    def sigma_to_t(self, sigma, quantize=None):
        """Map continuous sigma values to fractional discrete timestep indices.

        Interpolates linearly (in log-sigma space) between the two nearest
        entries of ``self.noise_schedule.log_sigmas``.
        """
        # NOTE(review): the `quantize` parameter is overridden here, so the
        # nearest-index branch below is currently unreachable — confirm intended.
        quantize = False
        log_sigma = sigma.log()
        # Signed distance from log(sigma) to each entry of the log-sigma table.
        dists = log_sigma - self.noise_schedule.log_sigmas[:, None]
        if quantize:
            return dists.abs().argmin(dim=0).view(sigma.shape)
        # Index of the lower bracketing table entry, clamped so that
        # high_idx = low_idx + 1 stays inside the table.
        low_idx = dists.ge(0).cumsum(dim=0).argmax(dim=0).clamp(max=self.noise_schedule.log_sigmas.shape[0] - 2)
        high_idx = low_idx + 1
        low, high = self.noise_schedule.log_sigmas[low_idx], self.noise_schedule.log_sigmas[high_idx]
        # Interpolation weight in log-sigma space, clamped to [0, 1].
        w = (low - log_sigma) / (low - high)
        w = w.clamp(0, 1)
        # Fractional index between the two bracketing discrete timesteps.
        t = (1 - w) * low_idx + w * high_idx
        return t.view(sigma.shape)
|
| 88 |
+
|
| 89 |
+
def get_time_steps(self, skip_type, t_T, t_0, N, device, denoise_to_zero=False, is_high_resolution=True):
    """Compute the intermediate time steps for sampling.

    Args:
        skip_type: A `str`. The type for the spacing of the time steps. We support:
            - 'logSNR': uniform logSNR for the time steps.
            - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.)
            - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.)
            - 'customed_time_karras': hand-tuned Karras-style schedules for
              specific step counts (N in {5, 6, 7, 8}), with separate constant
              tables for high- and low-resolution sampling.
        t_T: A `float`. The starting time of the sampling (default is T).
        t_0: A `float`. The ending time of the sampling (default is epsilon).
        N: A `int`. The total number of the spacing of the time steps.
        device: A torch device.
        denoise_to_zero: whether to append a final step at ``t_0`` (only used
            by some fp16 'customed_time_karras' branches).
        is_high_resolution: selects the high- vs low-resolution constant table.
    Returns:
        A pytorch tensor of the time steps, with the shape (N + 1,).
    """
    if skip_type == "logSNR":
        lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device))
        lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device))
        logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device)
        return self.noise_schedule.inverse_lambda(logSNR_steps)
    elif skip_type == "time_uniform":
        return torch.linspace(t_T, t_0, N + 1).to(device)
    elif skip_type == "time_quadratic":
        t_order = 2
        t = torch.linspace(t_T ** (1.0 / t_order), t_0 ** (1.0 / t_order), N + 1).pow(t_order).to(device)
        return t
    elif skip_type == "customed_time_karras" and is_high_resolution:
        # The schedule's sigma table is ascending, so sigmas[-1] is the largest.
        sigma_T = self.noise_schedule.sigmas[-1].cpu().item()
        sigma_0 = self.noise_schedule.sigmas[0].cpu().item()
        # Each (N, fp16-mode) pair below uses hand-tuned Karras parameters:
        # an outer sigma ramp, a slice of it, and an inner rho=1.2 re-spacing
        # of the corresponding timesteps. Constants were presumably tuned
        # empirically — do not alter without re-validation.
        if N == 8:
            sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=12.0, device=device)
            if not self.need_fp16_discrete_method:
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[10])
                ct = self.get_sigmas_karras(9, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                # Normalize discrete timesteps into [0, 1] (999 = last index).
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            else:
                sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(8, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                tmp_t = [self.noise_schedule.sigma_to_t(sigma).to('cpu') for sigma in sigmas_ct]
                real_ct = [t / 999 for t in tmp_t]
        elif N == 5:
            sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
            if not self.need_fp16_discrete_method:
                sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=12.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[9])
                ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            else:
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(5, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
        elif N == 6:
            sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
            if not self.need_fp16_discrete_method:
                sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=12.0, device=device)
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[10])
                ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            else:
                if denoise_to_zero:
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                    ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                    # Extra final step at t_0 for denoise-to-zero.
                    real_ct.append(torch.tensor(t_0).to(dtype=real_ct[-1].dtype, device='cpu'))
                else:
                    sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=7.0, device=device)
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[7])
                    ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
        elif N == 7:
            sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
            if not self.need_fp16_discrete_method:
                ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                ct = self.get_sigmas_karras(8, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
            else:
                if denoise_to_zero:
                    ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
                    ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
                    sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
                    real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
                    real_ct.append(torch.tensor(t_0).to(dtype=real_ct[-1].dtype, device='cpu'))
                # NOTE(review): with N == 7, fp16 mode and denoise_to_zero=False,
                # real_ct is never assigned and the code below would raise
                # UnboundLocalError — confirm this combination is never used.
        # if denoise_to_zero:
        #     real_ct.append(torch.tensor(t_0).to(dtype=real_ct[-1].dtype,device='cpu'))

        if self.use_afs:
            # AFS (analytic first step): insert a midpoint between the first
            # two timesteps so the first model call can be skipped.
            tmp_t = (real_ct[0] + real_ct[1]) / 2
            real_ct.insert(1, tmp_t)
        none_k_ct = torch.from_numpy(np.array(real_ct)).to(device)
        return none_k_ct  # real_ct
    elif skip_type == "customed_time_karras" and not is_high_resolution:
        sigma_T = self.noise_schedule.sigmas[-1].cpu().item()
        sigma_0 = self.noise_schedule.sigmas[0].cpu().item()
        if N == 8:
            sigmas = self.get_sigmas_karras(12, sigma_0, sigma_T, rho=7.0, device=device)
            ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[9])
            ct = self.get_sigmas_karras(9, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
            sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
            real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
        elif N == 5:
            sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
            ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
            ct = self.get_sigmas_karras(6, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
            sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
            real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
        elif N == 6:
            # NOTE(review): also stores the ramp on self.sigmas as a side
            # effect (chained assignment in the original) — confirm intended.
            sigmas = self.sigmas = self.get_sigmas_karras(8, sigma_0, sigma_T, rho=5.0, device=device)
            ct_start, ct_end = self.noise_schedule.sigma_to_t(sigmas[0]), self.sigma_to_t(sigmas[6])
            ct = self.get_sigmas_karras(7, ct_end.item(), ct_start.item(), rho=1.2, device='cpu', need_append_zero=False).numpy()
            sigmas_ct = self.noise_schedule.get_special_sigmas_with_timesteps(ct).to(device=device)
            real_ct = [self.noise_schedule.sigma_to_t(sigma).to('cpu') / 999 for sigma in sigmas_ct]
        none_k_ct = torch.from_numpy(np.array(real_ct)).to(device)
        return none_k_ct  # real_ct
    else:
        raise ValueError(
            "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type)
        )
|
| 215 |
+
|
| 216 |
+
|
| 217 |
+
def multistep_uni_pc_update(self, x, model_prev_list:list, t_prev_list: list, t, order, **kwargs):
|
| 218 |
+
if len(model_prev_list) == 0 or len(t_prev_list) == 0:
|
| 219 |
+
return None, None
|
| 220 |
+
if len(t.shape) == 0:
|
| 221 |
+
t = t.view(-1)
|
| 222 |
+
if True:#'bh' in self.variant:
|
| 223 |
+
return self.multistep_uni_pc_bh_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
|
| 224 |
+
else:
|
| 225 |
+
# assert self.variant == 'vary_coeff'
|
| 226 |
+
return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
|
| 227 |
+
|
| 228 |
+
def multistep_uni_pc_sde_update(self, x, model_prev_list:list, t_prev_list: list, t, order, level = 1.0, **kwargs):
|
| 229 |
+
if len(model_prev_list) == 0 or len(t_prev_list) == 0:
|
| 230 |
+
return None, None
|
| 231 |
+
if len(t.shape) == 0:
|
| 232 |
+
t = t.view(-1)
|
| 233 |
+
if True:#'bh' in self.variant:
|
| 234 |
+
return self.multistep_uni_pc_bh_sde_update(x, model_prev_list, t_prev_list, t, level=level, order= order, **kwargs)
|
| 235 |
+
else:
|
| 236 |
+
# assert self.variant == 'vary_coeff'
|
| 237 |
+
return self.multistep_uni_pc_vary_update(x, model_prev_list, t_prev_list, t, order, **kwargs)
|
| 238 |
+
|
| 239 |
+
def multistep_uni_pc_bh_update(self, x, model_prev_list, t_prev_list, t, order, x_t=None, use_corrector=True):
    """One multistep UniPC update of solver type B(h).

    Builds the Vandermonde-style system R @ rho = b from the logSNR ratios of
    the cached steps, applies the explicit predictor, then (optionally) one
    corrector pass using a fresh model evaluation at the predicted point.

    Args:
        x: current sample.
        model_prev_list: cached model outputs, newest last.
        t_prev_list: timesteps matching ``model_prev_list``.
        t: target timestep, shape (1,).
        order: solver order (<= len(model_prev_list)).
        x_t: optional precomputed predictor result; skips the predictor.
        use_corrector: whether to run the corrector (costs one model call).
    Returns:
        ``(x_t, model_t)`` where ``model_t`` is the model output at ``t``
        (``None`` when the corrector is skipped).
    """
    ns = self.noise_schedule
    assert order <= len(model_prev_list)
    dims = x.dim()

    # first compute rks (ratios of logSNR gaps relative to the current step h)
    t_prev_0 = t_prev_list[-1]
    lambda_prev_0 = ns.marginal_lambda(t_prev_0)
    lambda_t = ns.marginal_lambda(t)
    model_prev_0 = model_prev_list[-1]
    sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
    log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
    alpha_t = torch.exp(log_alpha_t)

    h = lambda_t - lambda_prev_0

    rks = []
    D1s = []  # scaled first differences of past model outputs
    for i in range(1, order):
        t_prev_i = t_prev_list[-(i + 1)]
        model_prev_i = model_prev_list[-(i + 1)]
        lambda_prev_i = ns.marginal_lambda(t_prev_i)
        rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
        rks.append(rk)
        D1s.append((model_prev_i - model_prev_0) / rk)

    rks.append(1.)
    rks = torch.tensor(rks, device=x.device)

    R = []
    b = []

    hh = h[0]
    h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
    h_phi_k = h_phi_1 / hh - 1

    factorial_i = 1

    # B(h) = h variant is hard-wired; the expm1 alternative below is dead code
    # kept from the reference implementation.
    if True:
        B_h = hh
    else:
        B_h = torch.expm1(hh)

    for i in range(1, order + 1):
        R.append(torch.pow(rks, i - 1))
        b.append(h_phi_k * factorial_i / B_h)
        factorial_i *= (i + 1)
        h_phi_k = h_phi_k / hh - 1 / factorial_i

    R = torch.stack(R)
    b = torch.tensor(b, device=x.device)

    # now predictor
    use_predictor = len(D1s) > 0 and x_t is None
    if len(D1s) > 0:
        D1s = torch.stack(D1s, dim=1)  # (B, K)
        if x_t is None:
            # for order 2, we use a simplified version
            if order == 2:
                rhos_p = torch.tensor([0.5], device=b.device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
    else:
        D1s = None

    if use_corrector:
        # for order 1, we use a simplified version
        if order == 1:
            rhos_c = torch.tensor([0.5], device=b.device)
        else:
            rhos_c = torch.linalg.solve(R, b)

    model_t = None

    # DPM-Solver-style base update (noise-prediction parameterization).
    x_t_ = (
        expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
        - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
    )
    if x_t is None:
        if use_predictor:
            pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
        else:
            pred_res = 0
        x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * pred_res

    if use_corrector:
        model_t = self.noise_prediction_fn(x_t, t)
        if D1s is not None:
            corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
        else:
            corr_res = 0
        D1_t = (model_t - model_prev_0)
        x_t = x_t_ - expand_dims(sigma_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t)

    return x_t, model_t
|
| 336 |
+
|
| 337 |
+
def multistep_uni_pc_bh_sde_update(self, x, model_prev_list, t_prev_list, t, order, level=0, x_t=None, use_corrector=True):
    """Stochastic (SDE) variant of the B(h) multistep UniPC update.

    Identical in structure to ``multistep_uni_pc_bh_update``, but injects
    Gaussian noise scaled by ``level`` and amplifies the drift terms by
    ``(1 + level)`` in the corrector; ``level=0`` reduces the corrector to the
    deterministic update.

    Args:
        level: stochastic strength; also scales the injected noise ``z``.
        (remaining args as in ``multistep_uni_pc_bh_update``.)
    Returns:
        ``(x_t, model_t)`` (``model_t`` is ``None`` if corrector skipped).
    """
    ns = self.noise_schedule
    assert order <= len(model_prev_list)
    dims = x.dim()

    # first compute rks
    t_prev_0 = t_prev_list[-1]
    lambda_prev_0 = ns.marginal_lambda(t_prev_0)
    lambda_t = ns.marginal_lambda(t)
    model_prev_0 = model_prev_list[-1]
    sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
    log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
    alpha_t = torch.exp(log_alpha_t)

    h = lambda_t - lambda_prev_0
    # SDE noise term: variance matches the exponential-integrator SDE step.
    z = torch.randn(x.shape, device=self.device)
    z = sigma_t * torch.sqrt(torch.expm1(2.0 * h[0])) * z

    rks = []
    D1s = []
    for i in range(1, order):
        t_prev_i = t_prev_list[-(i + 1)]
        model_prev_i = model_prev_list[-(i + 1)]
        lambda_prev_i = ns.marginal_lambda(t_prev_i)
        rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
        rks.append(rk)
        D1s.append((model_prev_i - model_prev_0) / rk)

    rks.append(1.)
    rks = torch.tensor(rks, device=x.device)

    R = []
    b = []

    hh = h[0]
    h_phi_1 = torch.expm1(hh)  # h\phi_1(h) = e^h - 1
    h_phi_k = h_phi_1 / hh - 1

    factorial_i = 1

    # B(h) = h variant is hard-wired (dead else kept from reference code).
    if True:
        B_h = hh
    else:
        B_h = torch.expm1(hh)

    for i in range(1, order + 1):
        R.append(torch.pow(rks, i - 1))
        b.append(h_phi_k * factorial_i / B_h)
        factorial_i *= (i + 1)
        h_phi_k = h_phi_k / hh - 1 / factorial_i

    R = torch.stack(R)
    b = torch.tensor(b, device=x.device)

    # now predictor
    use_predictor = len(D1s) > 0 and x_t is None
    if len(D1s) > 0:
        D1s = torch.stack(D1s, dim=1)  # (B, K)
        if x_t is None:
            # for order 2, we use a simplified version
            if order == 2:
                rhos_p = torch.tensor([0.5], device=b.device)
            else:
                rhos_p = torch.linalg.solve(R[:-1, :-1], b[:-1])
    else:
        D1s = None

    if use_corrector:
        # for order 1, we use a simplified version
        if order == 1:
            rhos_c = torch.tensor([0.5], device=b.device)
        else:
            rhos_c = torch.linalg.solve(R, b)

    model_t = None

    # Corrector base point: drift amplified by (1 + level).
    x_t_ = (
        expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
        - expand_dims(sigma_t * h_phi_1, dims) * (1 + level) * model_prev_0
    )
    if x_t is None:
        if use_predictor:
            pred_res = torch.einsum('k,bkchw->bchw', rhos_p, D1s)
        else:
            pred_res = 0

        # Predictor uses the deterministic (level-free) base point.
        x_t_p = (
            expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x
            - expand_dims(sigma_t * h_phi_1, dims) * model_prev_0
        )
        x_t = x_t_p - expand_dims(sigma_t * B_h, dims) * pred_res

    if use_corrector:
        model_t = self.noise_prediction_fn(x_t, t)
        if D1s is not None:
            corr_res = torch.einsum('k,bkchw->bchw', rhos_c[:-1], D1s)
        else:
            corr_res = 0
        D1_t = (model_t - model_prev_0)
        # Amplified corrector plus level-scaled stochastic noise.
        x_t = x_t_ - (1 + level) * expand_dims(sigma_t * B_h, dims) * (corr_res + rhos_c[-1] * D1_t) + z * level

    return x_t, model_t
|
| 441 |
+
|
| 442 |
+
|
| 443 |
+
def multistep_uni_pc_vary_update(self, x, model_prev_list, t_prev_list, t, order, use_corrector=True):
    """Multistep UniPC update of solver type 'vary_coeff'.

    NOTE(review): this path is unreachable from ``multistep_uni_pc_update``
    (which hard-selects the B(h) variant); kept for reference.
    Also note ``hh = h`` here keeps a (1,)-shaped tensor, unlike the B(h)
    variants which use the scalar ``h[0]`` — confirm intended.
    """
    ns = self.noise_schedule
    assert order <= len(model_prev_list)
    dims = x.dim()
    # first compute rks
    t_prev_0 = t_prev_list[-1]
    lambda_prev_0 = ns.marginal_lambda(t_prev_0)
    lambda_t = ns.marginal_lambda(t)
    model_prev_0 = model_prev_list[-1]
    sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t)
    log_alpha_t = ns.marginal_log_mean_coeff(t)
    alpha_t = torch.exp(log_alpha_t)

    h = lambda_t - lambda_prev_0

    rks = []
    D1s = []
    for i in range(1, order):
        t_prev_i = t_prev_list[-(i + 1)]
        model_prev_i = model_prev_list[-(i + 1)]
        lambda_prev_i = ns.marginal_lambda(t_prev_i)
        rk = ((lambda_prev_i - lambda_prev_0) / h)[0]
        rks.append(rk)
        D1s.append((model_prev_i - model_prev_0) / rk)

    rks.append(1.)
    rks = torch.tensor(rks, device=x.device)

    K = len(rks)
    # build C matrix (column k holds rks^k / k! scaled terms)
    C = []

    col = torch.ones_like(rks)
    for k in range(1, K + 1):
        C.append(col)
        col = col * rks / (k + 1)
    C = torch.stack(C, dim=1)

    if len(D1s) > 0:
        D1s = torch.stack(D1s, dim=1)  # (B, K)
        C_inv_p = torch.linalg.inv(C[:-1, :-1])
        A_p = C_inv_p

    if use_corrector:
        C_inv = torch.linalg.inv(C)
        A_c = C_inv

    hh = h
    h_phi_1 = torch.expm1(hh)
    h_phi_ks = []
    factorial_k = 1
    h_phi_k = h_phi_1
    for k in range(1, K + 2):
        h_phi_ks.append(h_phi_k)
        h_phi_k = h_phi_k / hh - 1 / factorial_k
        factorial_k *= (k + 1)

    model_t = None
    if True:
        log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t)
        x_t_ = (
            expand_dims((torch.exp(log_alpha_t - log_alpha_prev_0)), dims) * x
            - expand_dims((sigma_t * h_phi_1), dims) * model_prev_0
        )
        # now predictor
        x_t = x_t_
        if len(D1s) > 0:
            # compute the residuals for predictor
            for k in range(K - 1):
                x_t = x_t - expand_dims(sigma_t * h_phi_ks[k + 1], dims) * torch.einsum('bkchw,k->bchw', D1s, A_p[k])
        # now corrector
        if use_corrector:
            model_t = self.noise_prediction_fn(x_t, t)
            D1_t = (model_t - model_prev_0)
            x_t = x_t_
            k = 0
            for k in range(K - 1):
                x_t = x_t - expand_dims(sigma_t * h_phi_ks[k + 1], dims) * torch.einsum('bkchw,k->bchw', D1s, A_c[k][:-1])
            x_t = x_t - expand_dims(sigma_t * h_phi_ks[K], dims) * (D1_t * A_c[k][-1])
    return x_t, model_t
|
| 525 |
+
|
| 526 |
+
def sample(
    self,
    x,
    model_fn,
    order,
    use_corrector,
    lower_order_final,
    start_free_u_step=None,
    free_u_apply_callback=None,
    free_u_stop_callback=None,
    npnet_x=None,
    npnet_scale=None,
    half=False,
    return_intermediate=False,
):
    """Deterministic multistep UniPC sampling loop.

    Args:
        x: initial noise latent.
        model_fn: noise-prediction model; wrapped to broadcast t over batch.
        order: UniPC order; first ``order - 1`` steps warm up at lower order.
        use_corrector: requested corrector setting (overridden per step below).
        lower_order_final: reduce order near the end (recommended few-step).
        start_free_u_step / free_u_apply_callback / free_u_stop_callback:
            FreeU toggling hooks; FreeU is enabled starting at the given step.
        npnet_x / npnet_scale: optional NPNet replacement for the first
            model output (golden-noise style initialization).
        half, return_intermediate: currently unused — kept for interface
            compatibility.
    Returns:
        ``(x, full_cache)`` — final sample and per-step intermediate list.
    """
    self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
    steps = self.steps
    vec_t = self.timesteps[0].expand((x.shape[0]))
    if free_u_stop_callback is not None:
        free_u_stop_callback()
    if start_free_u_step is not None and 0 == start_free_u_step and free_u_apply_callback is not None:
        free_u_apply_callback()
        has_called_free_u = True
    if not self.use_afs:
        fir_output = self.noise_prediction_fn(x, vec_t)
    else:
        # AFS: approximate the first model call analytically.
        fir_output = x * 0.97  # ultilize npnet there in the future
    if npnet_x is not None and npnet_scale is not None:
        fir_output = npnet_x
        # fir_output = fir_output - npnet_scale * (npnet_out - fir_output) #guidance_scale * (noise - noise_uncond)
    # NOTE(review): the latent x is replaced by the first model output here
    # (and again below with model_x) — confirm this is the intended
    # parameterization rather than keeping the solver state.
    x = fir_output.clone().detach().to(fir_output.device)

    model_prev_list = [fir_output]
    full_cache = [fir_output]
    t_prev_list = [vec_t]
    # NOTE(review): this reset clobbers the True set above when
    # start_free_u_step == 0; harmless today because later guards compare
    # against steps >= 1, but fragile — confirm.
    has_called_free_u = False
    # Warm-up: increase the solver order one step at a time.
    for init_order in range(1, order):
        if start_free_u_step is not None and init_order == start_free_u_step and free_u_apply_callback is not None and (not has_called_free_u):
            free_u_apply_callback()
            has_called_free_u = True
        vec_t = self.timesteps[init_order].expand(x.shape[0])
        x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, init_order, use_corrector=True)
        if model_x is None:
            model_x = self.noise_prediction_fn(x, vec_t)
        x = model_x.clone().detach().to(torch.float32).to(model_x.device)
        full_cache.append(x)
        model_prev_list.append(model_x)
        t_prev_list.append(vec_t)

    # Main loop at full order (or decaying order near the end).
    for step in range(order, steps + 1):
        if start_free_u_step is not None and step == start_free_u_step and free_u_apply_callback is not None and (not has_called_free_u):
            free_u_apply_callback()
        vec_t = self.timesteps[step].expand(x.shape[0])
        if lower_order_final:
            step_order = min(order, steps + 1 - step)
        else:
            step_order = order
        if step == steps:
            # do not run corrector at the last step
            use_corrector = False
        else:
            use_corrector = True
        x, model_x = self.multistep_uni_pc_update(x, model_prev_list, t_prev_list, vec_t, step_order, use_corrector=use_corrector)
        # Shift the history window forward by one step.
        for i in range(order - 1):
            t_prev_list[i] = t_prev_list[i + 1]
            model_prev_list[i] = model_prev_list[i + 1]
        t_prev_list[-1] = vec_t
        # We do not need to evaluate the final model value.
        full_cache.append(x)
        if step < steps:
            if model_x is None:
                model_x = self.noise_prediction_fn(x, vec_t)
            model_prev_list[-1] = model_x
    return x, full_cache
|
| 602 |
+
def sample_mix(
    self,
    x,
    model_fn,
    order,
    use_corrector,
    lower_order_final,
    start_free_u_step=None,
    free_u_apply_callback=None,
    free_u_stop_callback=None,
    noise_level=0.1,
    half=False,
    return_intermediate=False,
):
    """Mixed ODE/SDE multistep UniPC sampling loop.

    Same structure as ``sample``, but every update goes through the SDE
    variant: steps at or after ``start_free_u_step`` use
    ``level=noise_level`` (stochastic); earlier steps use ``level=0.0``
    (deterministic). FreeU is applied via the same callbacks.

    Args:
        noise_level: stochastic strength for the late (FreeU) steps.
        (remaining args as in ``sample``.)
    Returns:
        ``(x, full_cache)`` — final sample and per-step intermediate list.
    """
    self.model = lambda x, t: model_fn(x, t.expand((x.shape[0])))
    steps = self.steps
    vec_t = self.timesteps[0].expand((x.shape[0]))
    fir_output = self.noise_prediction_fn(x, vec_t)
    model_prev_list = [fir_output]
    full_cache = [fir_output]
    t_prev_list = [vec_t]
    has_called_free_u = False
    if free_u_stop_callback is not None:
        free_u_stop_callback()
    # Warm-up: increase the solver order one step at a time.
    for init_order in range(1, order):
        if start_free_u_step is not None and init_order == start_free_u_step and free_u_apply_callback is not None:
            free_u_apply_callback()
            has_called_free_u = True
        vec_t = self.timesteps[init_order].expand(x.shape[0])
        if start_free_u_step is not None and init_order >= start_free_u_step and free_u_apply_callback is not None:
            # FreeU active: stochastic update.
            x, model_x = self.multistep_uni_pc_sde_update(
                x, model_prev_list, t_prev_list, vec_t, init_order,
                use_corrector=True, level=noise_level)
        else:
            # Deterministic update (level 0 disables the noise term).
            x, model_x = self.multistep_uni_pc_sde_update(
                x, model_prev_list, t_prev_list, vec_t, init_order,
                use_corrector=True, level=0.0)
        if model_x is None:
            model_x = self.noise_prediction_fn(x, vec_t)
        # NOTE(review): the latent x is replaced by the model output here —
        # confirm intended (same pattern as in ``sample``).
        x = model_x.clone().detach().to(torch.float32).to(model_x.device)
        full_cache.append(x)
        model_prev_list.append(model_x)
        t_prev_list.append(vec_t)

    if free_u_stop_callback is not None:
        free_u_stop_callback()
    # Main loop at full order (or decaying order near the end).
    for step in range(order, steps + 1):
        if start_free_u_step is not None and step == start_free_u_step and free_u_apply_callback is not None and (not has_called_free_u):
            free_u_apply_callback()
        vec_t = self.timesteps[step].expand(x.shape[0])
        if lower_order_final:
            step_order = min(order, steps + 1 - step)
        else:
            step_order = order
        if step == steps:
            # do not run corrector at the last step
            use_corrector = False
        else:
            use_corrector = True
        if start_free_u_step is not None and step >= start_free_u_step and free_u_apply_callback is not None:
            x, model_x = self.multistep_uni_pc_sde_update(
                x, model_prev_list, t_prev_list, vec_t, step_order,
                use_corrector=use_corrector, level=noise_level)
        else:
            x, model_x = self.multistep_uni_pc_sde_update(
                x, model_prev_list, t_prev_list, vec_t, step_order,
                use_corrector=use_corrector, level=0.0)
        # Shift the history window forward by one step.
        for i in range(order - 1):
            t_prev_list[i] = t_prev_list[i + 1]
            model_prev_list[i] = model_prev_list[i + 1]
        t_prev_list[-1] = vec_t
        # We do not need to evaluate the final model value.
        full_cache.append(x)
        if step < steps:
            if model_x is None:
                model_x = self.noise_prediction_fn(x, vec_t)
            model_prev_list[-1] = model_x
    return x, full_cache
|
| 697 |
+
|
| 698 |
+
|
| 699 |
+
|
| 700 |
+
|
| 701 |
+
#############################################################
|
| 702 |
+
# other utility functions
|
| 703 |
+
#############################################################
|
| 704 |
+
|
| 705 |
+
def interpolate_fn(x, xp, yp):
    """
    A piecewise linear function y = f(x), using xp and yp as keypoints.
    We implement f(x) in a differentiable way (i.e. applicable for autograd).
    The function f(x) is well-defined for all x-axis. (For x beyond the bounds of xp, we use the outmost points of xp to define the linear function.)

    Args:
        x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver).
        xp: PyTorch tensor with shape [C, K], where K is the number of keypoints.
        yp: PyTorch tensor with shape [C, K].
    Returns:
        The function values f(x), with shape [N, C].
    """
    N, K = x.shape[0], xp.shape[1]
    # Sort x together with the keypoints; the rank of x among the keypoints
    # is recovered from where index 0 (x itself) lands after sorting.
    all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2)
    sorted_all_x, x_indices = torch.sort(all_x, dim=2)
    x_idx = torch.argmin(x_indices, dim=2)
    cand_start_idx = x_idx - 1
    # Clamp out-of-range queries onto the first/last segment so extrapolation
    # continues the outermost linear piece.
    start_idx = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(1, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1)
    start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2)
    end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2)
    # Separate index into yp (unsorted keypoint space).
    start_idx2 = torch.where(
        torch.eq(x_idx, 0),
        torch.tensor(0, device=x.device),
        torch.where(
            torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx,
        ),
    )
    y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1)
    start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2)
    end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2)
    # Linear interpolation between the bracketing keypoints.
    cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x)
    return cand
|
| 745 |
+
|
| 746 |
+
|
| 747 |
+
def expand_dims(v, dims):
    """
    Expand the tensor `v` to the dim `dims`.

    Args:
        `v`: a PyTorch tensor with shape [N].
        `dim`: a `int`.
    Returns:
        a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`.
    """
    out = v
    for _ in range(dims - 1):
        out = out[..., None]
    return out
|