rookie9 committed on Sep 13, 2025

Commit

79f3e78

verified ·

1 Parent(s): cb9f65f

Upload 77 files

Browse files

This view is limited to 50 files because it contains too many changes. See raw diff

Files changed (50) hide show

config.json +103 -0
ezaudio_vae/1m.pt +3 -0
model.py +151 -0
model_index.json +6 -0
models/__pycache__/common.cpython-310.pyc +0 -0
models/__pycache__/content_adapter.cpython-310.pyc +0 -0
models/__pycache__/diffusion.cpython-310.pyc +0 -0
models/__pycache__/diffusion_cfg.cpython-310.pyc +0 -0
models/__pycache__/diffusion_cfg_new.cpython-310.pyc +0 -0
models/__pycache__/diffusion_content_cfg.cpython-310.pyc +0 -0
models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc +0 -0
models/autoencoder/autoencoder_base.py +22 -0
models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc +0 -0
models/autoencoder/waveform/stable_vae.py +537 -0
models/common.py +69 -0
models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/content_test.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc +0 -0
models/content_encoder/__pycache__/text_encoder.cpython-310.pyc +0 -0
models/content_encoder/caption_encoder.py +116 -0
models/content_encoder/text_encoder.py +76 -0
models/diffusion.py +398 -0
models/dit/__pycache__/attention.cpython-310.pyc +0 -0
models/dit/__pycache__/audio_dit.cpython-310.pyc +0 -0
models/dit/__pycache__/mask_dit.cpython-310.pyc +0 -0
models/dit/__pycache__/modules.cpython-310.pyc +0 -0
models/dit/__pycache__/rotary.cpython-310.pyc +0 -0
models/dit/__pycache__/span_mask.cpython-310.pyc +0 -0
models/dit/attention.py +350 -0
models/dit/audio_diffsingernet_dit.py +520 -0
models/dit/audio_dit.py +549 -0
models/dit/mask_dit.py +823 -0

config.json ADDED Viewed

	@@ -0,0 +1,103 @@

+{
+  "model_type": "PicoAudio2",
+  "autoencoder": {
+    "_target_": "models.autoencoder.waveform.stable_vae.StableVAE",
+    "encoder": {
+      "_target_": "models.autoencoder.waveform.stable_vae.OobleckEncoder",
+      "in_channels": 1,
+      "channels": 128,
+      "c_mults": [
+        1,
+        2,
+        4,
+        8
+      ],
+      "strides": [
+        2,
+        4,
+        6,
+        10
+      ],
+      "latent_dim": 256,
+      "use_snake": true
+    },
+    "decoder": {
+      "_target_": "models.autoencoder.waveform.stable_vae.OobleckDecoder",
+      "out_channels": 1,
+      "channels": 128,
+      "c_mults": [
+        1,
+        2,
+        4,
+        8
+      ],
+      "strides": [
+        2,
+        4,
+        6,
+        10
+      ],
+      "latent_dim": 128,
+      "use_snake": true,
+      "final_tanh": false
+    },
+    "io_channels": 1,
+    "latent_dim": 128,
+    "downsampling_ratio": 480,
+    "sample_rate": 24000,
+    "pretrained_ckpt": "/mnt/petrelfs/zhengzihao/cache/ezaudio/ckpts/vae/1m.pt",
+    "bottleneck": {
+      "_target_": "models.autoencoder.waveform.stable_vae.VAEBottleneck"
+    }
+  },
+  "backbone": {
+    "_target_": "models.dit.audio_dit.AudioUDiT",
+    "img_size": 1000,
+    "patch_size": 1,
+    "in_chans": 128,
+    "out_chans": 128,
+    "input_type": "1d",
+    "embed_dim": 1024,
+    "depth": 24,
+    "num_heads": 16,
+    "mlp_ratio": 4.0,
+    "qkv_bias": false,
+    "qk_scale": null,
+    "qk_norm": "layernorm",
+    "norm_layer": "layernorm",
+    "act_layer": "geglu",
+    "context_norm": true,
+    "use_checkpoint": true,
+    "time_fusion": "ada_sola_bias",
+    "ada_sola_rank": 32,
+    "ada_sola_alpha": 32,
+    "cls_dim": null,
+    "ta_context_dim": 1024,
+    "ta_context_fusion": "concat",
+    "ta_context_norm": true,
+    "context_dim": 1024,
+    "context_fusion": "cross",
+    "context_max_length": null,
+    "context_pe_method": "none",
+    "pe_method": "none",
+    "rope_mode": "shared",
+    "use_conv": true,
+    "skip": true,
+    "skip_norm": true
+  },
+  "_target_": "models.diffusion.AudioDiffusion",
+  "content_encoder": {
+    "_target_": "models.content_encoder.caption_encoder.ContentEncoder",
+    "text_encoder": {
+      "_target_": "models.content_encoder.text_encoder.T5TextEncoder",
+      "model_name": "/mnt/petrelfs/zhengzihao/cache/google-flan-t5-large"
+    }
+  },
+  "frame_resolution": 0.005,
+  "noise_scheduler_name": "/mnt/petrelfs/zhengzihao/cache/stabilityai-stable-diffusion-2-1",
+  "snr_gamma": 5.0,
+  "classifier_free_guidance": true,
+  "cfg_drop_ratio": 0.2,
+  "num_steps": 50,
+  "guidance_scale": 7.5
+}

ezaudio_vae/1m.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3cb13e2699fa922ce6a2b3b4f53c270ec64156e0cc3f3e3645e10cdf98b740dc
+size 183037614

model.py ADDED Viewed

	@@ -0,0 +1,151 @@

+import torch
+from transformers import PreTrainedModel, PretrainedConfig
+import inspect, importlib
+from safetensors.torch import load_file
+from models.diffusion import AudioDiffusion
+class PicoAudio2Config(PretrainedConfig):
+    model_type = "PicoAudio2"
+    def __init__(
+        self,
+        autoencoder=None,
+        content_encoder=None,
+        backbone=None,
+        frame_resolution: float = 0.005,
+        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
+        snr_gamma: float = 5.0,
+        classifier_free_guidance: bool = True,
+        cfg_drop_ratio: float = 0.2,
+        num_steps: int = 50,
+        guidance_scale: float = 7.5,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.autoencoder = autoencoder
+        self.content_encoder = content_encoder
+        self.backbone = backbone
+        self.frame_resolution = frame_resolution
+        self.noise_scheduler_name = noise_scheduler_name
+        self.snr_gamma = snr_gamma
+        self.classifier_free_guidance = classifier_free_guidance
+        self.cfg_drop_ratio = cfg_drop_ratio
+        self.num_steps = num_steps
+        self.guidance_scale = guidance_scale
+class PicoAudio2HF(PreTrainedModel):
+    config_class = PicoAudio2Config
+    def __init__(self, config: PicoAudio2Config):
+        super().__init__(config)
+        autoencoder = self._build_submodule(config.autoencoder)
+        content_encoder = self.build_content_encoder_from_config(config.content_encoder)
+        backbone = self._build_submodule(config.backbone)
+        state_dict = load_file("model.safetensors")
+        new_state_dict = {k.replace("backbone.", ""): v for k, v in state_dict.items()}
+        backbone.load_state_dict(new_state_dict, strict=False, assign=True)
+        self.inner_model = AudioDiffusion(
+            autoencoder=autoencoder,
+            content_encoder=content_encoder,
+            backbone=backbone,
+            frame_resolution=config.frame_resolution,
+            noise_scheduler_name=config.noise_scheduler_name,
+            snr_gamma=config.snr_gamma,
+            classifier_free_guidance=config.classifier_free_guidance,
+            cfg_drop_ratio=config.cfg_drop_ratio,
+        )
+    def build_content_encoder_from_config(self, content_encoder_cfg):
+        te_cfg = content_encoder_cfg['text_encoder']
+        te_mod_path, te_cls_name = te_cfg['_target_'].rsplit('.', 1)
+        te_mod = importlib.import_module(te_mod_path)
+        TextEncoderClass = getattr(te_mod, te_cls_name)
+        text_encoder = TextEncoderClass(model_name=te_cfg['model_name'])
+        ce_mod_path, ce_cls_name = content_encoder_cfg['_target_'].rsplit('.', 1)
+        ce_mod = importlib.import_module(ce_mod_path)
+        ContentEncoderClass = getattr(ce_mod, ce_cls_name)
+        content_encoder = ContentEncoderClass(text_encoder=text_encoder)
+        return content_encoder
+    def _build_submodule(self, sub_config):
+        import inspect
+        if sub_config is None:
+            return None
+        if isinstance(sub_config, dict) and "_target_" in sub_config:
+            kwargs = {}
+            for k, v in sub_config.items():
+                if k == "_target_":
+                    continue
+                if isinstance(v, dict) and "_target_" in v:
+                    kwargs[k] = self._build_submodule(v)
+                else:
+                    kwargs[k] = v
+            module_path, class_name = sub_config["_target_"].rsplit(".", 1)
+            module = __import__(module_path, fromlist=[class_name])
+            cls = getattr(module, class_name)
+            obj = cls(**kwargs)
+            if "pretrained_ckpt" in sub_config:
+                state_dict = torch.load(sub_config["pretrained_ckpt"])
+                if "state_dict" in state_dict:
+                    new_state_dict = state_dict["state_dict"]
+                state_dict = {k.replace("autoencoder.", ""): v for k, v in new_state_dict.items()}
+                sig = inspect.signature(obj.load_state_dict)
+                if "assign" in sig.parameters:
+                    result = obj.load_state_dict(state_dict, strict=False, assign=True)
+                else:
+                    result = obj.load_state_dict(state_dict, strict=False)
+                self._check_param_stats(obj, class_name)
+            return obj
+        else:
+            return sub_config
+    def _check_weights(self, module, name):
+        if hasattr(module, "load_state_dict") and hasattr(module, "state_dict"):
+            print(f"[{name}] parameter keys:", list(module.state_dict().keys())[:5], "...")
+            for idx, (k, v) in enumerate(module.state_dict().items()):
+                print(f"[{name}] {k}: mean={v.float().mean():.5f}, std={v.float().std():.5f}")
+                if idx >= 2:
+                    break
+    def _check_param_stats(self, module, name):
+        if hasattr(module, "named_parameters"):
+            for idx, (k, v) in enumerate(module.named_parameters()):
+                print(f"[{name}] {k}: mean={v.data.float().mean():.5f}, std={v.data.float().std():.5f}")
+                if idx >= 2:
+                    break
+    def forward(
+        self,
+        content,
+        num_steps=None,
+        guidance_scale=None,
+        guidance_rescale=0.0,
+        disable_progress=True,
+        num_samples_per_content=1,
+        **kwargs
+    ):
+        num_steps = num_steps if num_steps is not None else self.config.num_steps
+        guidance_scale = guidance_scale if guidance_scale is not None else self.config.guidance_scale
+        return self.inner_model.inference(
+            content=[content],
+            num_steps=num_steps,
+            guidance_scale=guidance_scale,
+            guidance_rescale=guidance_rescale,
+            disable_progress=disable_progress,
+            num_samples_per_content=num_samples_per_content,
+            **kwargs
+        )
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
+        config = PicoAudio2Config.from_pretrained(pretrained_model_name_or_path, **kwargs)
+        model = cls(config)
+        return model
+    def load_state_dict(self, state_dict, *args, **kwargs):
+        pass

model_index.json ADDED Viewed

	@@ -0,0 +1,6 @@

+{
+    "auto_map": {
+      "AutoConfig": "model.PicoAudio2Config",
+      "AutoModel": "model.PicoAudio2HF"
+    }
+  }

models/__pycache__/common.cpython-310.pyc ADDED Viewed

Binary file (3.1 kB). View file

models/__pycache__/content_adapter.cpython-310.pyc ADDED Viewed

Binary file (3.87 kB). View file

models/__pycache__/diffusion.cpython-310.pyc ADDED Viewed

Binary file (10.5 kB). View file

models/__pycache__/diffusion_cfg.cpython-310.pyc ADDED Viewed

Binary file (18.9 kB). View file

models/__pycache__/diffusion_cfg_new.cpython-310.pyc ADDED Viewed

Binary file (18.8 kB). View file

models/__pycache__/diffusion_content_cfg.cpython-310.pyc ADDED Viewed

Binary file (18.5 kB). View file

models/autoencoder/__pycache__/autoencoder_base.cpython-310.pyc ADDED Viewed

Binary file (1.06 kB). View file

models/autoencoder/autoencoder_base.py ADDED Viewed

	@@ -0,0 +1,22 @@

+from abc import abstractmethod, ABC
+from typing import Sequence
+import torch
+import torch.nn as nn
+class AutoEncoderBase(ABC):
+    def __init__(
+        self, downsampling_ratio: int, sample_rate: int,
+        latent_shape: Sequence[int | None]
+    ):
+        self.downsampling_ratio = downsampling_ratio
+        self.sample_rate = sample_rate
+        self.latent_token_rate = sample_rate // downsampling_ratio
+        self.latent_shape = latent_shape
+        self.time_dim = latent_shape.index(None) + 1  # the first dim is batch
+    @abstractmethod
+    def encode(
+        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        ...

models/autoencoder/waveform/__pycache__/stable_vae.cpython-310.pyc ADDED Viewed

Binary file (12 kB). View file

models/autoencoder/waveform/stable_vae.py ADDED Viewed

	@@ -0,0 +1,537 @@

+from typing import Any, Literal, Callable
+import math
+from pathlib import Path
+import torch
+import torch.nn as nn
+from torch.nn.utils import weight_norm
+import torchaudio
+from alias_free_torch import Activation1d
+from models.common import LoadPretrainedBase
+from models.autoencoder.autoencoder_base import AutoEncoderBase
+from utils.torch_utilities import remove_key_prefix_factory, create_mask_from_length
+# Original author's note: compiling with TorchScript makes this about 1.4x
+# faster and saves GPU memory.
+@torch.jit.script
+def snake_beta(x, alpha, beta):
+    # Computes x + sin(alpha * x)^2 / (beta + 1e-9); the 1e-9 term keeps the
+    # division finite when beta is zero.
+    return x + (1.0 / (beta+0.000000001)) * pow(torch.sin(x * alpha), 2)
+class SnakeBeta(nn.Module):
+    def __init__(
+        self,
+        in_features,
+        alpha=1.0,
+        alpha_trainable=True,
+        alpha_logscale=True
+    ):
+        super(SnakeBeta, self).__init__()
+        self.in_features = in_features
+        # initialize alpha
+        self.alpha_logscale = alpha_logscale
+        if self.alpha_logscale:
+            # log scale alphas initialized to zeros
+            self.alpha = nn.Parameter(torch.zeros(in_features) * alpha)
+            self.beta = nn.Parameter(torch.zeros(in_features) * alpha)
+        else:
+            # linear scale alphas initialized to ones
+            self.alpha = nn.Parameter(torch.ones(in_features) * alpha)
+            self.beta = nn.Parameter(torch.ones(in_features) * alpha)
+        self.alpha.requires_grad = alpha_trainable
+        self.beta.requires_grad = alpha_trainable
+        # self.no_div_by_zero = 0.000000001
+    def forward(self, x):
+        alpha = self.alpha.unsqueeze(0).unsqueeze(-1)
+        # line up with x to [B, C, T]
+        beta = self.beta.unsqueeze(0).unsqueeze(-1)
+        if self.alpha_logscale:
+            alpha = torch.exp(alpha)
+            beta = torch.exp(beta)
+        x = snake_beta(x, alpha, beta)
+        return x
+def WNConv1d(*args, **kwargs):
+    return weight_norm(nn.Conv1d(*args, **kwargs))
+def WNConvTranspose1d(*args, **kwargs):
+    return weight_norm(nn.ConvTranspose1d(*args, **kwargs))
+def get_activation(
+    activation: Literal["elu", "snake", "none"],
+    antialias=False,
+    channels=None
+) -> nn.Module:
+    if activation == "elu":
+        act = nn.ELU()
+    elif activation == "snake":
+        act = SnakeBeta(channels)
+    elif activation == "none":
+        act = nn.Identity()
+    else:
+        raise ValueError(f"Unknown activation {activation}")
+    if antialias:
+        act = Activation1d(act)
+    return act
+class ResidualUnit(nn.Module):
+    """Residual block: activation, dilated kernel-7 conv, activation, kernel-1 conv, plus skip.
+    Args:
+        in_channels: Input channels of the dilated convolution.
+        out_channels: Output channels of both convolutions; also the channel
+            count handed to both activations.
+        dilation: Dilation of the kernel-7 convolution.
+        use_snake: Use the "snake" activation instead of "elu".
+        antialias_activation: Forwarded as ``antialias`` to ``get_activation``.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        dilation,
+        use_snake=False,
+        antialias_activation=False
+    ):
+        super().__init__()
+        self.dilation = dilation
+        # Length-preserving padding for a kernel-size-7 convolution with this
+        # dilation, so the residual addition in forward() lines up in time.
+        padding = (dilation * (7-1)) // 2
+        self.layers = nn.Sequential(
+            # NOTE(review): this first activation is sized with out_channels
+            # although it is applied to the in_channels input; the two only
+            # agree when in_channels == out_channels (which the residual add
+            # in forward() appears to need anyway) -- confirm.
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=out_channels
+            ),
+            WNConv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=7,
+                dilation=dilation,
+                padding=padding
+            ),
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=out_channels
+            ),
+            WNConv1d(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                kernel_size=1
+            )
+        )
+    def forward(self, x):
+        """Return ``self.layers(x) + x``."""
+        res = x
+        # Gradient checkpointing is currently disabled:
+        #x = checkpoint(self.layers, x)
+        x = self.layers(x)
+        return x + res
+class EncoderBlock(nn.Module):
+    """Encoder stage: three residual units (dilations 1, 3, 9), an activation and a strided conv.
+    Args:
+        in_channels: Channels of the input and of the residual units.
+        out_channels: Channels produced by the final strided convolution.
+        stride: Stride of the final convolution; its kernel size is ``2 * stride``.
+        use_snake: Use the "snake" activation instead of "elu".
+        antialias_activation: Applied only to the activation before the
+            strided convolution; it is not forwarded to the residual units.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride,
+        use_snake=False,
+        antialias_activation=False
+    ):
+        super().__init__()
+        self.layers = nn.Sequential(
+            ResidualUnit(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                dilation=1,
+                use_snake=use_snake
+            ),
+            ResidualUnit(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                dilation=3,
+                use_snake=use_snake
+            ),
+            ResidualUnit(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                dilation=9,
+                use_snake=use_snake
+            ),
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=in_channels
+            ),
+            # Strided convolution that changes the channel count and
+            # downsamples in time by `stride`.
+            WNConv1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2)
+            ),
+        )
+    def forward(self, x):
+        """Apply the sequential stack to ``x``."""
+        return self.layers(x)
+class DecoderBlock(nn.Module):
+    """Decoder stage: activation, upsampling layer, then three residual units (dilations 1, 3, 9).
+    Args:
+        in_channels: Channels of the input.
+        out_channels: Channels after upsampling and in the residual units.
+        stride: Upsampling factor; convolution kernels use ``2 * stride``.
+        use_snake: Use the "snake" activation instead of "elu".
+        antialias_activation: Applied only to the leading activation; it is
+            not forwarded to the residual units.
+        use_nearest_upsample: When true, upsample with nearest-neighbour
+            interpolation followed by a stride-1 convolution; otherwise use a
+            transposed convolution.
+    """
+    def __init__(
+        self,
+        in_channels,
+        out_channels,
+        stride,
+        use_snake=False,
+        antialias_activation=False,
+        use_nearest_upsample=False
+    ):
+        super().__init__()
+        if use_nearest_upsample:
+            # Interpolate first, then mix channels with a bias-free
+            # 'same'-padded convolution.
+            upsample_layer = nn.Sequential(
+                nn.Upsample(scale_factor=stride, mode="nearest"),
+                WNConv1d(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=2 * stride,
+                    stride=1,
+                    bias=False,
+                    padding='same'
+                )
+            )
+        else:
+            # Transposed convolution with the same kernel/stride/padding
+            # arguments as the encoder's strided convolution.
+            upsample_layer = WNConvTranspose1d(
+                in_channels=in_channels,
+                out_channels=out_channels,
+                kernel_size=2 * stride,
+                stride=stride,
+                padding=math.ceil(stride / 2)
+            )
+        self.layers = nn.Sequential(
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=in_channels
+            ),
+            upsample_layer,
+            ResidualUnit(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                dilation=1,
+                use_snake=use_snake
+            ),
+            ResidualUnit(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                dilation=3,
+                use_snake=use_snake
+            ),
+            ResidualUnit(
+                in_channels=out_channels,
+                out_channels=out_channels,
+                dilation=9,
+                use_snake=use_snake
+            ),
+        )
+    def forward(self, x):
+        """Apply the sequential stack to ``x``."""
+        return self.layers(x)
+class OobleckEncoder(nn.Module):
+    """Convolutional encoder: input conv, one ``EncoderBlock`` per stride, output conv.
+    Args:
+        in_channels: Channels of the input signal.
+        channels: Base channel width; stage widths are ``c_mults[i] * channels``.
+        latent_dim: Output channels of the final convolution.
+        c_mults: Per-stage channel multipliers (a leading 1 is prepended internally).
+        strides: Stride of each ``EncoderBlock``.
+        use_snake: Use the "snake" activation instead of "elu".
+        antialias_activation: Applied only to the final activation; it is not
+            forwarded to the encoder blocks.
+    """
+    def __init__(
+        self,
+        in_channels=2,
+        channels=128,
+        latent_dim=32,
+        c_mults=[1, 2, 4, 8],
+        strides=[2, 4, 8, 8],
+        use_snake=False,
+        antialias_activation=False
+    ):
+        super().__init__()
+        # Builds a new list, so the list default above is never mutated.
+        c_mults = [1] + c_mults
+        self.depth = len(c_mults)
+        # Input projection to the base width.
+        layers = [
+            WNConv1d(
+                in_channels=in_channels,
+                out_channels=c_mults[0] * channels,
+                kernel_size=7,
+                padding=3
+            )
+        ]
+        # One block per stage, widening from c_mults[i] to c_mults[i + 1].
+        for i in range(self.depth - 1):
+            layers += [
+                EncoderBlock(
+                    in_channels=c_mults[i] * channels,
+                    out_channels=c_mults[i + 1] * channels,
+                    stride=strides[i],
+                    use_snake=use_snake
+                )
+            ]
+        # Final activation and projection to the latent width.
+        layers += [
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=c_mults[-1] * channels
+            ),
+            WNConv1d(
+                in_channels=c_mults[-1] * channels,
+                out_channels=latent_dim,
+                kernel_size=3,
+                padding=1
+            )
+        ]
+        self.layers = nn.Sequential(*layers)
+    def forward(self, x):
+        """Apply the sequential stack to ``x``."""
+        return self.layers(x)
+class OobleckDecoder(nn.Module):
+    """Convolutional decoder: input conv, one ``DecoderBlock`` per stride (reversed), output conv.
+    Args:
+        out_channels: Channels of the produced signal.
+        channels: Base channel width; stage widths are ``c_mults[i] * channels``.
+        latent_dim: Channels of the latent input.
+        c_mults: Per-stage channel multipliers (a leading 1 is prepended internally).
+        strides: Strides of the decoder blocks, consumed from last to first.
+        use_snake: Use the "snake" activation instead of "elu".
+        antialias_activation: Forwarded to each ``DecoderBlock`` and to the
+            final activation.
+        use_nearest_upsample: Forwarded to each ``DecoderBlock``.
+        final_tanh: Append ``nn.Tanh`` when true, ``nn.Identity`` otherwise.
+    """
+    def __init__(
+        self,
+        out_channels=2,
+        channels=128,
+        latent_dim=32,
+        c_mults=[1, 2, 4, 8],
+        strides=[2, 4, 8, 8],
+        use_snake=False,
+        antialias_activation=False,
+        use_nearest_upsample=False,
+        final_tanh=True
+    ):
+        super().__init__()
+        # Builds a new list, so the list default above is never mutated.
+        c_mults = [1] + c_mults
+        self.depth = len(c_mults)
+        # Input projection from the latent width to the widest stage.
+        layers = [
+            WNConv1d(
+                in_channels=latent_dim,
+                out_channels=c_mults[-1] * channels,
+                kernel_size=7,
+                padding=3
+            ),
+        ]
+        # Walk the stages from widest to narrowest.
+        for i in range(self.depth - 1, 0, -1):
+            layers += [
+                DecoderBlock(
+                    in_channels=c_mults[i] * channels,
+                    out_channels=c_mults[i - 1] * channels,
+                    stride=strides[i - 1],
+                    use_snake=use_snake,
+                    antialias_activation=antialias_activation,
+                    use_nearest_upsample=use_nearest_upsample
+                )
+            ]
+        # Final activation, bias-free output projection and optional tanh.
+        layers += [
+            get_activation(
+                "snake" if use_snake else "elu",
+                antialias=antialias_activation,
+                channels=c_mults[0] * channels
+            ),
+            WNConv1d(
+                in_channels=c_mults[0] * channels,
+                out_channels=out_channels,
+                kernel_size=7,
+                padding=3,
+                bias=False
+            ),
+            nn.Tanh() if final_tanh else nn.Identity()
+        ]
+        self.layers = nn.Sequential(*layers)
+    def forward(self, x):
+        """Apply the sequential stack to ``x``."""
+        return self.layers(x)
+class Bottleneck(nn.Module):
+    def __init__(self, is_discrete: bool = False):
+        super().__init__()
+        self.is_discrete = is_discrete
+    def encode(self, x, return_info=False, **kwargs):
+        raise NotImplementedError
+    def decode(self, x):
+        raise NotImplementedError
+@torch.jit.script
+def vae_sample(mean, scale) -> dict[str, torch.Tensor]:
+    # Draws a reparameterised sample and computes a KL term.
+    # Returns {"latents": sample, "kl": scalar KL value}.
+    # softplus keeps the standard deviation positive; 1e-4 is a floor.
+    stdev = nn.functional.softplus(scale) + 1e-4
+    var = stdev * stdev
+    logvar = torch.log(var)
+    # Sample as mean + stdev * standard-normal noise.
+    latents = torch.randn_like(mean) * stdev + mean
+    # Summed over dim 1, then averaged over the remaining dims.
+    kl = (mean*mean + var - logvar - 1).sum(1).mean()
+    return {"latents": latents, "kl": kl}
+class VAEBottleneck(Bottleneck):
+    """Continuous bottleneck that samples latents via ``vae_sample``."""
+    def __init__(self):
+        super().__init__(is_discrete=False)
+    def encode(self,
+               x,
+               return_info=False,
+               **kwargs) -> tuple[torch.Tensor, dict[str, torch.Tensor]] | torch.Tensor:
+        """Split ``x`` into mean and scale along dim 1 and sample latents.
+        Args:
+            x: Tensor whose dim 1 is chunked into two halves (mean, scale).
+            return_info: When true, also return ``{"kl": ...}``.
+            **kwargs: Accepted and ignored.
+        Returns:
+            The sampled latents, or ``(latents, {"kl": kl})`` when
+            ``return_info`` is true.
+        """
+        mean, scale = x.chunk(2, dim=1)
+        sampled = vae_sample(mean, scale)
+        if return_info:
+            return sampled["latents"], {"kl": sampled["kl"]}
+        else:
+            return sampled["latents"]
+    def decode(self, x):
+        """Return ``x`` unchanged."""
+        return x
+def compute_mean_kernel(x, y):
+    kernel_input = (x[:, None] - y[None]).pow(2).mean(2) / x.shape[-1]
+    return torch.exp(-kernel_input).mean()
+class Pretransform(nn.Module):
+    def __init__(self, enable_grad, io_channels, is_discrete):
+        super().__init__()
+        self.is_discrete = is_discrete
+        self.io_channels = io_channels
+        self.encoded_channels = None
+        self.downsampling_ratio = None
+        self.enable_grad = enable_grad
+    def encode(self, x):
+        raise NotImplementedError
+    def decode(self, z):
+        raise NotImplementedError
+    def tokenize(self, x):
+        raise NotImplementedError
+    def decode_tokens(self, tokens):
+        raise NotImplementedError
+class StableVAE(LoadPretrainedBase, AutoEncoderBase):
+    """Waveform autoencoder combining an encoder, a bottleneck and a decoder.
+    Args:
+        encoder: Module applied to the waveform in ``encode``.
+        decoder: Module applied to the latents in ``decode``.
+        latent_dim: Latent channel count; also used to build ``latent_shape``.
+        downsampling_ratio: Waveform samples per latent frame.
+        sample_rate: Waveform sample rate.
+        io_channels: Default for both ``in_channels`` and ``out_channels``.
+        bottleneck: Module whose ``encode`` is applied after the encoder.
+        pretransform: Stored as given; not used by the methods visible here.
+        in_channels: Overrides ``io_channels`` for the input when not ``None``.
+        out_channels: Overrides ``io_channels`` for the output when not ``None``.
+        soft_clip: Stored as given; not used by the methods visible here.
+        pretrained_ckpt: When not ``None``, loaded via ``load_pretrained`` at
+            the end of construction.
+    """
+    def __init__(
+        self,
+        encoder,
+        decoder,
+        latent_dim,
+        downsampling_ratio,
+        sample_rate,
+        io_channels=2,
+        bottleneck: Bottleneck = None,
+        pretransform: Pretransform = None,
+        in_channels=None,
+        out_channels=None,
+        soft_clip=False,
+        pretrained_ckpt: str | Path = None
+    ):
+        # Both bases are initialised explicitly because they take different
+        # arguments.
+        LoadPretrainedBase.__init__(self)
+        AutoEncoderBase.__init__(
+            self,
+            downsampling_ratio=downsampling_ratio,
+            sample_rate=sample_rate,
+            latent_shape=(latent_dim, None)
+        )
+        self.latent_dim = latent_dim
+        self.io_channels = io_channels
+        self.in_channels = io_channels
+        self.out_channels = io_channels
+        self.min_length = self.downsampling_ratio
+        if in_channels is not None:
+            self.in_channels = in_channels
+        if out_channels is not None:
+            self.out_channels = out_channels
+        self.bottleneck = bottleneck
+        self.encoder = encoder
+        self.decoder = decoder
+        self.pretransform = pretransform
+        self.soft_clip = soft_clip
+        self.is_discrete = self.bottleneck is not None and self.bottleneck.is_discrete
+        # Built from the "autoencoder." prefix; used in process_state_dict.
+        self.remove_autoencoder_prefix_fn: Callable = remove_key_prefix_factory(
+            "autoencoder."
+        )
+        # Load last, once every sub-module has been registered.
+        if pretrained_ckpt is not None:
+            self.load_pretrained(pretrained_ckpt)
+    def process_state_dict(self, model_dict, state_dict):
+        """Unwrap ``state_dict["state_dict"]`` and apply the prefix-removal function.
+        The checkpoint must contain a ``"state_dict"`` entry; a missing key
+        raises ``KeyError``.
+        """
+        state_dict = state_dict["state_dict"]
+        state_dict = self.remove_autoencoder_prefix_fn(model_dict, state_dict)
+        return state_dict
+    def encode(
+        self, waveform: torch.Tensor, waveform_lengths: torch.Tensor
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Encode ``waveform`` into latents and build a mask from the latent lengths.
+        Returns:
+            ``(z, z_mask)``: the bottleneck output, and the mask created by
+            ``create_mask_from_length`` from
+            ``waveform_lengths // downsampling_ratio``.
+        """
+        z = self.encoder(waveform)
+        z = self.bottleneck.encode(z)
+        z_length = waveform_lengths // self.downsampling_ratio
+        z_mask = create_mask_from_length(z_length)
+        return z, z_mask
+    def decode(self, latents: torch.Tensor) -> torch.Tensor:
+        """Run the decoder on ``latents`` and return its output."""
+        waveform = self.decoder(latents)
+        return waveform
+# Manual smoke test: instantiate the autoencoder from a Hydra config, then
+# encode and decode one audio file and write the reconstruction to disk.
+if __name__ == '__main__':
+    import hydra
+    from utils.config import generate_config_from_command_line_overrides
+    model_config = generate_config_from_command_line_overrides(
+        "configs/model/autoencoder/stable_vae.yaml"
+    )
+    autoencoder: StableVAE = hydra.utils.instantiate(model_config)
+    autoencoder.eval()
+    # NOTE(review): hard-coded absolute path from the author's environment;
+    # replace it before running elsewhere.
+    waveform, sr = torchaudio.load(
+        "/hpc_stor03/sjtu_home/xuenan.xu/workspace/singing_voice_synthesis/diffsinger/data/raw/opencpop/segments/wavs/2007000230.wav"
+    )
+    # Resample to the rate given in the model config.
+    waveform = torchaudio.functional.resample(
+        waveform, sr, model_config["sample_rate"]
+    )
+    print("waveform: ", waveform.shape)
+    with torch.no_grad():
+        # The second return value of encode() is the latent mask.
+        latent, latent_length = autoencoder.encode(
+            waveform, torch.as_tensor([waveform.shape[-1]])
+        )
+        print("latent: ", latent.shape)
+        reconstructed = autoencoder.decode(latent)
+        print("reconstructed: ", reconstructed.shape)
+    import soundfile as sf
+    # Write only the first item / first channel of the reconstruction.
+    sf.write(
+        "./reconstructed.wav",
+        reconstructed[0, 0].numpy(),
+        samplerate=model_config["sample_rate"]
+    )

models/common.py ADDED Viewed

	@@ -0,0 +1,69 @@

+from pathlib import Path
+import torch
+import torch.nn as nn
+from utils.torch_utilities import load_pretrained_model, merge_matched_keys
+import warnings
+class LoadPretrainedBase(nn.Module):
+    """Base class that adds checkpoint loading through ``load_pretrained_model``."""
+    def process_state_dict(
+        self, model_dict: dict[str, torch.Tensor],
+        state_dict: dict[str, torch.Tensor]
+    ):
+        """
+        Transform a `state_dict` loaded from a checkpoint into one that can be
+        passed to `load_state_dict`. Subclasses may override this hook; by
+        default `merge_matched_keys` is used to update parameters with matched
+        names and shapes.
+        Args:
+            model_dict:
+                The state dict of the current model, which is going to load
+                the pretrained parameters.
+            state_dict:
+                A dictionary of parameters from a pre-trained model.
+        Returns:
+            dict[str, torch.Tensor]:
+                The updated state dict, where parameters with matched keys and
+                shapes are updated with the values in `state_dict`.
+        """
+        state_dict = merge_matched_keys(model_dict, state_dict)
+        return state_dict
+    def load_pretrained(self, ckpt_path: str | Path):
+        """Load `ckpt_path` into this module, using `process_state_dict` as the processing hook."""
+        load_pretrained_model(
+            self, ckpt_path, state_dict_process_fn=self.process_state_dict
+        )
+class CountParamsBase(nn.Module):
+    def count_params(self):
+        num_params = 0
+        trainable_params = 0
+        for param in self.parameters():
+            num_params += param.numel()
+            if param.requires_grad:
+                trainable_params += param.numel()
+        return num_params, trainable_params
+class SaveTrainableParamsBase(nn.Module):
+    @property
+    def param_names_to_save(self):
+        names = []
+        for name, param in self.named_parameters():
+            if param.requires_grad:
+                names.append(name)
+        for name, _ in self.named_buffers():
+            names.append(name)
+        return names
+    def load_state_dict(self, state_dict, strict=True, assign=True):
+        print("State dict keys:", list(state_dict.keys()))
+        for key in self.param_names_to_save:
+            if key not in state_dict:
+                raise Exception(
+                    f"{key} not found in either pre-trained models (e.g. BERT)"
+                    " or resumed checkpoints (e.g. epoch_40/model.pt)"
+                )
+        # 兼容 PyTorch/transformers 的 assign 参数
+        return super().load_state_dict(state_dict, strict=strict, assign=assign)

models/content_encoder/__pycache__/caption_encoder.cpython-310.pyc ADDED Viewed

Binary file (3.51 kB). View file

models/content_encoder/__pycache__/content_encoder.cpython-310.pyc ADDED Viewed

Binary file (4.72 kB). View file

models/content_encoder/__pycache__/content_encoder_add_1024.cpython-310.pyc ADDED Viewed

Binary file (4.62 kB). View file

models/content_encoder/__pycache__/content_encoder_clap.cpython-310.pyc ADDED Viewed

Binary file (6.11 kB). View file

models/content_encoder/__pycache__/content_encoder_clap_test.cpython-310.pyc ADDED Viewed

Binary file (6.12 kB). View file

models/content_encoder/__pycache__/content_encoder_concat.cpython-310.pyc ADDED Viewed

Binary file (4.74 kB). View file

models/content_encoder/__pycache__/content_encoder_concat_4096.cpython-310.pyc ADDED Viewed

Binary file (4.69 kB). View file

models/content_encoder/__pycache__/content_encoder_concat_4096_random.cpython-310.pyc ADDED Viewed

Binary file (4.73 kB). View file

models/content_encoder/__pycache__/content_encoder_full.cpython-310.pyc ADDED Viewed

Binary file (5.01 kB). View file

models/content_encoder/__pycache__/content_encoder_full_non.cpython-310.pyc ADDED Viewed

Binary file (5 kB). View file

models/content_encoder/__pycache__/content_encoder_full_non_test.cpython-310.pyc ADDED Viewed

Binary file (4.87 kB). View file

models/content_encoder/__pycache__/content_encoder_full_test.cpython-310.pyc ADDED Viewed

Binary file (4.48 kB). View file

models/content_encoder/__pycache__/content_encoder_full_woonset.cpython-310.pyc ADDED Viewed

Binary file (4.59 kB). View file

models/content_encoder/__pycache__/content_encoder_merge.cpython-310.pyc ADDED Viewed

Binary file (4.78 kB). View file

models/content_encoder/__pycache__/content_encoder_merge_test.cpython-310.pyc ADDED Viewed

Binary file (4.82 kB). View file

models/content_encoder/__pycache__/content_encoder_replace.cpython-310.pyc ADDED Viewed

Binary file (4.71 kB). View file

models/content_encoder/__pycache__/content_encoder_replace_merge.cpython-310.pyc ADDED Viewed

Binary file (4.72 kB). View file

models/content_encoder/__pycache__/content_encoder_replace_new.cpython-310.pyc ADDED Viewed

Binary file (4.71 kB). View file

models/content_encoder/__pycache__/content_encoder_test.cpython-310.pyc ADDED Viewed

Binary file (4.58 kB). View file

models/content_encoder/__pycache__/content_test.cpython-310.pyc ADDED Viewed

Binary file (4.71 kB). View file

models/content_encoder/__pycache__/new_content_encoder.cpython-310.pyc ADDED Viewed

Binary file (4.73 kB). View file

models/content_encoder/__pycache__/text_encoder.cpython-310.pyc ADDED Viewed

Binary file (2.71 kB). View file

models/content_encoder/caption_encoder.py ADDED Viewed

	@@ -0,0 +1,116 @@

+from typing import Any
+import torch
+import torch.nn as nn
+import random
+from utils.audiotime_event_merge import replace_event_synonyms
+def decode_data(line_onset_str, latent_length):
+    """
+    Extracts a timestamp matrix (event onset indices) from a formatted onset string.
+    Args:
+        line_onset_str (str): String containing event names and onset intervals,
+            formatted like "event1__start1-end1_start2-end2--event2__start1-end1".
+        latent_length (int): Length of the output matrix.
+    Returns:
+        line_onset_index (torch.Tensor): Matrix of shape [4, latent_length],
+        line_event (list): List of event names extracted from the onset string.
+    Notes:
+        - 24000 is the audio sample rate.
+        - 480 is the downsample ratio to align with VAE.
+        - Each onset interval "start-end" (in seconds) is converted to embedding indices via (time * 24000 / 480).
+    """
+    line_onset_index = torch.zeros((4, latent_length)) # max for 4 events
+    line_event = []
+    event_idx = 0
+    for event_onset in line_onset_str.split('--'):
+        #print(event_onset)
+        (event, instance) = event_onset.split('__')
+        #print(instance)
+        line_event.append(event)
+        for start_end in instance.split('_'):
+            (start, end) = start_end.split('-')
+            start, end = int(float(start)*24000/480), int(float(end)*24000/480)
+            if end > (latent_length - 1): break
+            line_onset_index[event_idx, start: end] = 1
+        event_idx = event_idx + 1
+    return line_onset_index, line_event
+class ContentEncoder(nn.Module):
+    """
+    ContentEncoder encodes TCC and TDC information.
+    """
+    def __init__(
+        self,
+        text_encoder: nn.Module= None,
+    ):
+        super().__init__()
+        self.text_encoder = text_encoder
+        self.pool = nn.AdaptiveAvgPool1d(1)
+    def encode_content(
+        self, batch_content: list[Any], device: str | torch.device
+    ):
+        batch_output = []
+        batch_mask = []
+        batch_onset = []
+        length_list = []
+        print(batch_content)
+        for content in batch_content:
+            caption = content["caption"]
+            onset = content["onset"]
+            length = int(float(content["length"]) *24000/480)
+                # Replacement for AudioTime
+            print(onset)
+            replace_label = content.get("replace_label", "False")
+            if replace_label == "True":
+                caption, onset = replace_event_synonyms(caption, onset)
+            # Handle random onset case for read data without timestamp
+            if content["onset"] == "random":
+                length_list.append(length)
+                """
+                fixed embedding. Actually it's a sick sentence, a error during training, kept to match the checkpoint.
+                You can change it to sentence that difference to captions in datasets.
+                The use of fixed text to obtain encoding is for numerical stability.
+                We attempted to use learnable unified encoding during training, but the results were not satisfactory.
+                """
+                event = "There is no event here"
+                event_embed = self.text_encoder([event.replace("_", " ")])["output"]
+                event_embed = self.pool(event_embed.permute(0, 2, 1))  # (B, 1024, 1)
+                event_embed = event_embed.flatten().unsqueeze(0)
+                new_onset = event_embed.repeat(length, 1).T
+            else:
+                onset_matrix, events = decode_data(onset, length)
+                length_list.append(length)
+                new_onset = torch.zeros((1024, length), device=device) # 1024 for T5
+                # TDC
+                for (idx, event) in enumerate(events):
+                    with torch.no_grad():
+                        event_embed = self.text_encoder([event.replace("_", " ")])["output"]
+                    event_embed = self.pool(event_embed.permute(0, 2, 1))  # (B, 1024, 1)
+                    event_embed = event_embed.flatten().unsqueeze(0)
+                    mask = (onset_matrix[idx, :] == 0)
+                    cols = mask.nonzero(as_tuple=True)[0]
+                    new_onset[:, cols] += event_embed.T.float()
+            # TCC
+            output_dict = self.text_encoder([caption])
+            batch_output.append(output_dict["output"][0])
+            batch_mask.append(output_dict["mask"][0])
+            batch_onset.append(new_onset)
+        # Pad all sequences in the batch to the same length for batching
+        batch_output = nn.utils.rnn.pad_sequence(
+            batch_output, batch_first=True, padding_value=0
+        )
+        batch_mask = nn.utils.rnn.pad_sequence(
+            batch_mask, batch_first=True, padding_value=False
+        )
+        batch_onset = nn.utils.rnn.pad_sequence(
+            batch_onset, batch_first=True, padding_value=0
+        )
+        return batch_output, batch_mask, batch_onset, length_list

models/content_encoder/text_encoder.py ADDED Viewed

	@@ -0,0 +1,76 @@

+import torch
+import torch.nn as nn
+from transformers import AutoTokenizer, AutoModel, T5Tokenizer, T5EncoderModel
+from transformers.modeling_outputs import BaseModelOutput
+# Select the device type string: "npu" when `torch_npu` can be imported, "cuda" otherwise.
+try:
+    import torch_npu
+    from torch_npu.contrib import transfer_to_npu
+    DEVICE_TYPE = "npu"
+except ModuleNotFoundError:
+    DEVICE_TYPE = "cuda"
+class TransformersTextEncoderBase(nn.Module):
+    """
+    Base class for text encoding using HuggingFace Transformers models.
+    """
+    def __init__(self, model_name: str):
+        super().__init__()
+        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
+        self.model = AutoModel.from_pretrained(model_name)
+    def forward(
+        self,
+        text: list[str],
+    ):
+        device = self.model.device
+        batch = self.tokenizer(
+            text,
+            max_length=self.tokenizer.model_max_length,
+            padding=True,
+            truncation=True,
+            return_tensors="pt"
+        )
+        input_ids = batch.input_ids.to(device)
+        attention_mask = batch.attention_mask.to(device)
+        output: BaseModelOutput = self.model(
+            input_ids=input_ids, attention_mask=attention_mask
+        )
+        output = output.last_hidden_state
+        mask = (attention_mask == 1).to(device)
+        return {"output": output, "mask": mask}
+class T5TextEncoder(TransformersTextEncoderBase):
+    """
+    Text encoder using T5 encoder model.
+    """
+    def __init__(self, model_name: str = "/mnt/petrelfs/zhengzihao/cache/google-flan-t5-large"):
+        nn.Module.__init__(self)
+        self.tokenizer = T5Tokenizer.from_pretrained(model_name)
+        self.model = T5EncoderModel.from_pretrained(model_name)
+        for param in self.model.parameters():
+            param.requires_grad = False
+        self.eval()
+    def forward(
+        self,
+        text: list[str],
+    ):
+        with torch.no_grad(), torch.amp.autocast(
+            device_type=DEVICE_TYPE, enabled=False
+        ):
+            return super().forward(text)
+if __name__ == '__main__':
+    text_encoder = T5TextEncoder()
+    text = ["dog barking and cat moving"]
+    text_encoder.eval()
+    with torch.no_grad():
+        output = text_encoder(text)
+    print(output["output"].shape)
+    #print(output)

models/diffusion.py ADDED Viewed

	@@ -0,0 +1,398 @@

+from typing import Sequence
+import random
+from typing import Any
+from tqdm import tqdm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import diffusers.schedulers as noise_schedulers
+from diffusers.schedulers.scheduling_utils import SchedulerMixin
+from diffusers.utils.torch_utils import randn_tensor
+import numpy as np
+from models.autoencoder.autoencoder_base import AutoEncoderBase
+from models.content_encoder.caption_encoder import ContentEncoder
+from models.common import LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase
+from utils.torch_utilities import (
+    create_alignment_path, create_mask_from_length, loss_with_mask,
+    trim_or_pad_length
+)
+class DiffusionMixin:
+    def __init__(
+        self,
+        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
+        snr_gamma: float = None,
+        classifier_free_guidance: bool = True,
+        cfg_drop_ratio: float = 0.2,
+    ) -> None:
+        self.noise_scheduler_name = noise_scheduler_name
+        self.snr_gamma = snr_gamma
+        self.classifier_free_guidance = classifier_free_guidance
+        self.cfg_drop_ratio = cfg_drop_ratio
+        self.noise_scheduler = noise_schedulers.DDIMScheduler.from_pretrained(
+            self.noise_scheduler_name, subfolder="scheduler"
+        )
+    def compute_snr(self, timesteps) -> torch.Tensor:
+        """
+        Computes SNR as per https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L847-L849
+        """
+        alphas_cumprod = self.noise_scheduler.alphas_cumprod
+        sqrt_alphas_cumprod = alphas_cumprod**0.5
+        sqrt_one_minus_alphas_cumprod = (1.0 - alphas_cumprod)**0.5
+        # Expand the tensors.
+        # Adapted from https://github.com/TiankaiHang/Min-SNR-Diffusion-Training/blob/521b624bd70c67cee4bdf49225915f5945a872e3/guided_diffusion/gaussian_diffusion.py#L1026
+        sqrt_alphas_cumprod = sqrt_alphas_cumprod.to(device=timesteps.device
+                                                    )[timesteps].float()
+        while len(sqrt_alphas_cumprod.shape) < len(timesteps.shape):
+            sqrt_alphas_cumprod = sqrt_alphas_cumprod[..., None]
+        alpha = sqrt_alphas_cumprod.expand(timesteps.shape)
+        sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod.to(
+            device=timesteps.device
+        )[timesteps].float()
+        while len(sqrt_one_minus_alphas_cumprod.shape) < len(timesteps.shape):
+            sqrt_one_minus_alphas_cumprod = sqrt_one_minus_alphas_cumprod[...,
+                                                                          None]
+        sigma = sqrt_one_minus_alphas_cumprod.expand(timesteps.shape)
+        # Compute SNR.
+        snr = (alpha / sigma)**2
+        return snr
+    def get_timesteps(
+        self,
+        batch_size: int,
+        device: torch.device,
+        training: bool = True
+    ) -> torch.Tensor:
+        if training:
+            timesteps = torch.randint(
+                0,
+                self.noise_scheduler.config.num_train_timesteps,
+                (batch_size, ),
+                device=device
+            )
+        else:
+            # validation on half of the total timesteps
+            timesteps = (self.noise_scheduler.config.num_train_timesteps //
+                         2) * torch.ones((batch_size, ),
+                                         dtype=torch.int64,
+                                         device=device)
+        timesteps = timesteps.long()
+        return timesteps
+    def get_target(
+        self, latent: torch.Tensor, noise: torch.Tensor,
+        timesteps: torch.Tensor
+    ) -> torch.Tensor:
+        """
+        Get the target for loss depending on the prediction type
+        """
+        if self.noise_scheduler.config.prediction_type == "epsilon":
+            target = noise
+        elif self.noise_scheduler.config.prediction_type == "v_prediction":
+            target = self.noise_scheduler.get_velocity(
+                latent, noise, timesteps
+            )
+        else:
+            raise ValueError(
+                f"Unknown prediction type {self.noise_scheduler.config.prediction_type}"
+            )
+        return target
+    def loss_with_snr(
+        self, pred: torch.Tensor, target: torch.Tensor,
+        timesteps: torch.Tensor, mask: torch.Tensor
+    ) -> torch.Tensor:
+        if self.snr_gamma is None:
+            loss = F.mse_loss(pred.float(), target.float(), reduction="none")
+            loss = loss_with_mask(loss, mask)
+        else:
+            # Compute loss-weights as per Section 3.4 of https://arxiv.org/abs/2303.09556.
+            # Adaptef from huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py
+            snr = self.compute_snr(timesteps)
+            mse_loss_weights = (
+                torch.stack([snr, self.snr_gamma * torch.ones_like(timesteps)],
+                            dim=1).min(dim=1)[0] / snr
+            )
+            loss = F.mse_loss(pred.float(), target.float(), reduction="none")
+            loss = loss_with_mask(loss, mask, reduce=False) * mse_loss_weights
+            loss = loss.mean()
+        return loss
+class AudioDiffusion(
+    LoadPretrainedBase, CountParamsBase, SaveTrainableParamsBase,
+    DiffusionMixin
+):
+    """
+    Args:
+        autoencoder (AutoEncoderBase): Pretrained autoencoder module VAE(frozen).
+        content_encoder (ContentEncoder): Encodes TCC and TDC information.
+        backbone (nn.Module): Main denoising network.
+        frame_resolution (float): Resolution for audio frames.
+        noise_scheduler_name (str): Noise scheduler identifier.
+        snr_gamma (float, optional): SNR gamma for noise scheduler.
+        classifier_free_guidance (bool): Enable classifier-free guidance.
+        cfg_drop_ratio (float): Ratio for randomly dropping context for classifier-free guidance.
+    """
+    def __init__(
+        self,
+        autoencoder: AutoEncoderBase,
+        content_encoder: ContentEncoder,
+        backbone: nn.Module,
+        frame_resolution:float,
+        noise_scheduler_name: str = "stabilityai/stable-diffusion-2-1",
+        snr_gamma: float = None,
+        classifier_free_guidance: bool = True,
+        cfg_drop_ratio: float = 0.2,
+    ):
+        nn.Module.__init__(self)
+        DiffusionMixin.__init__(
+            self, noise_scheduler_name, snr_gamma, classifier_free_guidance, cfg_drop_ratio
+        )
+        self.autoencoder = autoencoder
+        # Freeze autoencoder parameters
+        for param in self.autoencoder.parameters():
+            param.requires_grad = False
+        self.content_encoder = content_encoder
+        self.backbone = backbone
+        self.frame_resolution = frame_resolution
+        self.dummy_param = nn.Parameter(torch.empty(0))
+    def forward(
+        self, content: list[Any], condition: list[Any], task: list[str],
+        waveform: torch.Tensor, waveform_lengths: torch.Tensor, **kwargs
+    ):
+        """
+        Training forward pass.
+        Args:
+            content (list[Any]): List of content dicts for each sample.
+            condition (list[Any]): Conditioning information (unused here).
+            task (list[str]): List of task types.
+            waveform (Tensor): Batch of waveform tensors.
+            waveform_lengths (Tensor): Lengths for each waveform sample.
+        Returns:
+            dict: Dictionary containing the diffusion loss.
+        """
+        device = self.dummy_param.device
+        num_train_timesteps = self.noise_scheduler.config.num_train_timesteps
+        self.noise_scheduler.set_timesteps(num_train_timesteps, device=device)
+        self.autoencoder.eval()
+        with torch.no_grad():
+            latent, latent_mask = self.autoencoder.encode(
+                waveform.unsqueeze(1), waveform_lengths
+            )
+        # content(non_time_aligned_content) for TCC and time_aligned_content for TDC
+        content, content_mask, onset, _= self.content_encoder.encode_content(
+            content, device=device
+        )
+        # prepare latent and diffusion-related noise
+        time_aligned_content = onset.permute(0,2,1)
+        if self.training and self.classifier_free_guidance:
+            mask_indices = [
+                k for k in range(len(waveform)) if random.random() < self.cfg_drop_ratio
+            ]
+            if len(mask_indices) > 0:
+                content[mask_indices] = 0
+                time_aligned_content[mask_indices] = 0
+        batch_size = latent.shape[0]
+        timesteps = self.get_timesteps(batch_size, device, self.training)
+        noise = torch.randn_like(latent)
+        noisy_latent = self.noise_scheduler.add_noise(latent, noise, timesteps)
+        target = self.get_target(latent, noise, timesteps)
+        # Denoising prediction
+        pred: torch.Tensor = self.backbone(
+            x=noisy_latent,
+            timesteps=timesteps,
+            time_aligned_context=time_aligned_content,
+            context=content,
+            x_mask=latent_mask,
+            context_mask=content_mask
+        )
+        pred = pred.transpose(1, self.autoencoder.time_dim)
+        target = target.transpose(1, self.autoencoder.time_dim)
+        diff_loss = self.loss_with_snr(pred, target, timesteps, latent_mask)
+        return {
+            "diff_loss": diff_loss,
+        }
+    @torch.no_grad()
+    def inference(
+        self,
+        content: list[Any],
+        num_steps: int = 20,
+        guidance_scale: float = 3.0,
+        guidance_rescale: float = 0.0,
+        disable_progress: bool = True,
+        num_samples_per_content: int = 1,
+        **kwargs
+    ):
+        """
+        Inference/generation method for audio diffusion.
+        Args:
+            content (list[Any]): List of content dicts.
+            scheduler (SchedulerMixin): Scheduler for timesteps and noise.
+            num_steps (int): Number of denoising steps.
+            guidance_scale (float): Classifier-free guidance scale.
+            guidance_rescale (float): Rescale factor for guidance.
+            disable_progress (bool): Disable progress bar.
+            num_samples_per_content (int): How many samples to generate per content.
+        Returns:
+            waveform (Tensor): Generated waveform.
+        """
+        device = self.dummy_param.device
+        classifier_free_guidance = guidance_scale > 1.0
+        batch_size = len(content) * num_samples_per_content
+        print(content)
+        if classifier_free_guidance:
+            content, content_mask, onset, length_list = self.encode_content_classifier_free(
+                content, num_samples_per_content
+            )
+        else:
+            content, content_mask, onset, length_list = self.content_encoder.encode_content(
+            content, device=device
+        )
+            content = content.repeat_interleave(num_samples_per_content, 0)
+            content_mask = content_mask.repeat_interleave(
+                num_samples_per_content, 0
+            )
+        self.noise_scheduler.set_timesteps(num_steps, device=device)
+        timesteps = self.noise_scheduler.timesteps
+        # prepare input latent and context for the backbone
+        shape = (batch_size, 128, onset.shape[2])  # 128 for StableVAE channels
+        time_aligned_content = onset.permute(0,2,1)
+        latent = randn_tensor(
+            shape, generator=None, device=device, dtype=content.dtype
+        )
+        # scale the initial noise by the standard deviation required by the scheduler
+        latent = latent * self.noise_scheduler.init_noise_sigma
+        latent_mask = torch.full((batch_size, onset.shape[2]), False, device=device)
+        for i, length in enumerate(length_list):
+        # Set latent mask True for valid time steps for each sample
+            latent_mask[i, :length] = True
+        num_warmup_steps = len(timesteps) - num_steps * self.noise_scheduler.order
+        progress_bar = tqdm(range(num_steps), disable=disable_progress)
+        if classifier_free_guidance:
+            uncond_time_aligned_content = torch.zeros_like(
+                time_aligned_content
+            )
+            time_aligned_content = torch.cat(
+                [uncond_time_aligned_content, time_aligned_content]
+            )
+            latent_mask = torch.cat(
+                [latent_mask, latent_mask.detach().clone()]
+            )
+        # iteratively denoising
+        for i, timestep in enumerate(timesteps):
+            latent_input = torch.cat(
+                [latent, latent]
+            ) if classifier_free_guidance else latent
+            latent_input = self.noise_scheduler.scale_model_input(latent_input, timestep)
+            noise_pred = self.backbone(
+                x=latent_input,
+                x_mask=latent_mask,
+                timesteps=timestep,
+                time_aligned_context=time_aligned_content,
+                context=content,
+                context_mask=content_mask,
+            )
+            if classifier_free_guidance:
+                noise_pred_uncond, noise_pred_content = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (
+                    noise_pred_content - noise_pred_uncond
+                )
+                if guidance_rescale != 0.0:
+                    noise_pred = self.rescale_cfg(
+                        noise_pred_content, noise_pred, guidance_rescale
+                    )
+            # compute the previous noisy sample x_t -> x_t-1
+            latent = self.noise_scheduler.step(noise_pred, timestep, latent).prev_sample
+            # call the callback, if provided
+            if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and
+                                           (i+1) % self.noise_scheduler.order == 0):
+                progress_bar.update(1)
+        #latent = latent.to(next(self.autoencoder.parameters()).device)
+        waveform = self.autoencoder.decode(latent)
+        return waveform
+    def encode_content_classifier_free(
+        self,
+        content: list[Any],
+        task: list[str],
+        num_samples_per_content: int = 1
+    ):
+        device = self.dummy_param.device
+        content, content_mask, onset, length_list = self.content_encoder.encode_content(
+            content, device=device
+        )
+        content = content.repeat_interleave(num_samples_per_content, 0)
+        content_mask = content_mask.repeat_interleave(
+            num_samples_per_content, 0
+        )
+        # get unconditional embeddings for classifier free guidance
+        uncond_content = torch.zeros_like(content)
+        uncond_content_mask = content_mask.detach().clone()
+        uncond_content = uncond_content.repeat_interleave(
+            num_samples_per_content, 0
+        )
+        uncond_content_mask = uncond_content_mask.repeat_interleave(
+            num_samples_per_content, 0
+        )
+        # For classifier free guidance, we need to do two forward passes.
+        # We concatenate the unconditional and text embeddings into a single batch to avoid doing two forward passes
+        content = torch.cat([uncond_content, content])
+        content_mask = torch.cat([uncond_content_mask, content_mask])
+        return content, content_mask, onset, length_list
+    def rescale_cfg(
+        self, pred_cond: torch.Tensor, pred_cfg: torch.Tensor,
+        guidance_rescale: float
+    ):
+        """
+        Rescale `pred_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+        Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+        """
+        std_cond = pred_cond.std(
+            dim=list(range(1, pred_cond.ndim)), keepdim=True
+        )
+        std_cfg = pred_cfg.std(dim=list(range(1, pred_cfg.ndim)), keepdim=True)
+        pred_rescaled = pred_cfg * (std_cond / std_cfg)
+        pred_cfg = guidance_rescale * pred_rescaled + (
+            1 - guidance_rescale
+        ) * pred_cfg

models/dit/__pycache__/attention.cpython-310.pyc ADDED Viewed

Binary file (7.7 kB). View file

models/dit/__pycache__/audio_dit.cpython-310.pyc ADDED Viewed

Binary file (8.31 kB). View file

models/dit/__pycache__/mask_dit.cpython-310.pyc ADDED Viewed

Binary file (14.6 kB). View file

models/dit/__pycache__/modules.cpython-310.pyc ADDED Viewed

Binary file (14 kB). View file

models/dit/__pycache__/rotary.cpython-310.pyc ADDED Viewed

Binary file (2.79 kB). View file

models/dit/__pycache__/span_mask.cpython-310.pyc ADDED Viewed

Binary file (4.74 kB). View file

models/dit/attention.py ADDED Viewed

	@@ -0,0 +1,350 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.utils.checkpoint
+import einops
+from einops import rearrange, repeat
+from inspect import isfunction
+from .rotary import RotaryEmbedding
+from .modules import RMSNorm
+if hasattr(nn.functional, 'scaled_dot_product_attention'):
+    ATTENTION_MODE = 'flash'
+else:
+    ATTENTION_MODE = 'math'
+print(f'attention mode is {ATTENTION_MODE}')
+def add_mask(sim, mask):
+    b, ndim = sim.shape[0], mask.ndim
+    if ndim == 3:
+        mask = rearrange(mask, "b n m -> b 1 n m")
+    if ndim == 2:
+        mask = repeat(mask, "n m -> b 1 n m", b=b)
+    max_neg_value = -torch.finfo(sim.dtype).max
+    sim = sim.masked_fill(~mask, max_neg_value)
+    return sim
+def create_mask(q_shape, k_shape, device, q_mask=None, k_mask=None):
+    def default(val, d):
+        return val if val is not None else (d() if isfunction(d) else d)
+    b, i, j, device = q_shape[0], q_shape[-2], k_shape[-2], device
+    #print(q_mask)
+    q_mask = default(
+        q_mask, torch.ones((b, i), device=device, dtype=torch.bool)
+    )
+    k_mask = default(
+        k_mask, torch.ones((b, j), device=device, dtype=torch.bool)
+    )
+    attn_mask = rearrange(q_mask, 'b i -> b 1 i 1'
+                         ) * rearrange(k_mask, 'b j -> b 1 1 j')
+    return attn_mask
+class Attention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        context_dim=None,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        attn_drop=0.,
+        proj_drop=0.,
+        rope_mode='none'
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        if context_dim is None:
+            self.cross_attn = False
+        else:
+            self.cross_attn = True
+        context_dim = dim if context_dim is None else context_dim
+        self.to_q = nn.Linear(dim, dim, bias=qkv_bias)
+        self.to_k = nn.Linear(context_dim, dim, bias=qkv_bias)
+        self.to_v = nn.Linear(context_dim, dim, bias=qkv_bias)
+        if qk_norm is None:
+            self.norm_q = nn.Identity()
+            self.norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            self.norm_q = nn.LayerNorm(head_dim)
+            self.norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            self.norm_q = RMSNorm(head_dim)
+            self.norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj = nn.Linear(dim, dim)
+        self.proj_drop = nn.Dropout(proj_drop)
+        if self.cross_attn:
+            assert rope_mode == 'none'
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+    def _rotary(self, q, k, extras):
+        if self.rope_mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == 'x_only':
+            q_x, k_x = self.rotary(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'dual':
+            q_x, k_x = self.rotary_x(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = self.rotary_c(
+                q=q[:, :, :extras, :], k=k[:, :, :extras, :]
+            )
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'none':
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+    def _attn(self, q, k, v, mask_binary):
+        if ATTENTION_MODE == 'flash':
+            x = F.scaled_dot_product_attention(
+                q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
+            )
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(
+                attn, mask_binary
+            ) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+    def forward(self, x, context=None, context_mask=None, extras=0):
+        B, L, C = x.shape
+        if context is None:
+            context = x
+        q = self.to_q(x)
+        k = self.to_k(context)
+        v = self.to_v(context)
+        if context_mask is not None:
+            mask_binary = create_mask(
+                x.shape, context.shape, x.device, None, context_mask
+            )
+        else:
+            mask_binary = None
+        q = einops.rearrange(q, 'B L (H D) -> B H L D', H=self.num_heads)
+        k = einops.rearrange(k, 'B L (H D) -> B H L D', H=self.num_heads)
+        v = einops.rearrange(v, 'B L (H D) -> B H L D', H=self.num_heads)
+        q = self.norm_q(q)
+        k = self.norm_k(k)
+        q, k = self._rotary(q, k, extras)
+        x = self._attn(q, k, v, mask_binary)
+        x = self.proj(x)
+        x = self.proj_drop(x)
+        return x
+class JointAttention(nn.Module):
+    def __init__(
+        self,
+        dim,
+        num_heads=8,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        attn_drop=0.,
+        proj_drop=0.,
+        rope_mode='none'
+    ):
+        super().__init__()
+        self.num_heads = num_heads
+        head_dim = dim // num_heads
+        self.scale = qk_scale or head_dim**-0.5
+        self.to_qx, self.to_kx, self.to_vx = self._make_qkv_layers(
+            dim, qkv_bias
+        )
+        self.to_qc, self.to_kc, self.to_vc = self._make_qkv_layers(
+            dim, qkv_bias
+        )
+        self.norm_qx, self.norm_kx = self._make_norm_layers(qk_norm, head_dim)
+        self.norm_qc, self.norm_kc = self._make_norm_layers(qk_norm, head_dim)
+        self.attn_drop_p = attn_drop
+        self.attn_drop = nn.Dropout(attn_drop)
+        self.proj_x = nn.Linear(dim, dim)
+        self.proj_drop_x = nn.Dropout(proj_drop)
+        self.proj_c = nn.Linear(dim, dim)
+        self.proj_drop_c = nn.Dropout(proj_drop)
+        self.rope_mode = rope_mode
+        if self.rope_mode == 'shared' or self.rope_mode == 'x_only':
+            self.rotary = RotaryEmbedding(dim=head_dim)
+        elif self.rope_mode == 'dual':
+            self.rotary_x = RotaryEmbedding(dim=head_dim)
+            self.rotary_c = RotaryEmbedding(dim=head_dim)
+    def _make_qkv_layers(self, dim, qkv_bias):
+        return (
+            nn.Linear(dim, dim,
+                      bias=qkv_bias), nn.Linear(dim, dim, bias=qkv_bias),
+            nn.Linear(dim, dim, bias=qkv_bias)
+        )
+    def _make_norm_layers(self, qk_norm, head_dim):
+        if qk_norm is None:
+            norm_q = nn.Identity()
+            norm_k = nn.Identity()
+        elif qk_norm == 'layernorm':
+            norm_q = nn.LayerNorm(head_dim)
+            norm_k = nn.LayerNorm(head_dim)
+        elif qk_norm == 'rmsnorm':
+            norm_q = RMSNorm(head_dim)
+            norm_k = RMSNorm(head_dim)
+        else:
+            raise NotImplementedError
+        return norm_q, norm_k
+    def _rotary(self, q, k, extras):
+        if self.rope_mode == 'shared':
+            q, k = self.rotary(q=q, k=k)
+        elif self.rope_mode == 'x_only':
+            q_x, k_x = self.rotary(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = q[:, :, :extras, :], k[:, :, :extras, :]
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'dual':
+            q_x, k_x = self.rotary_x(
+                q=q[:, :, extras:, :], k=k[:, :, extras:, :]
+            )
+            q_c, k_c = self.rotary_c(
+                q=q[:, :, :extras, :], k=k[:, :, :extras, :]
+            )
+            q = torch.cat((q_c, q_x), dim=2)
+            k = torch.cat((k_c, k_x), dim=2)
+        elif self.rope_mode == 'none':
+            pass
+        else:
+            raise NotImplementedError
+        return q, k
+    def _attn(self, q, k, v, mask_binary):
+        if ATTENTION_MODE == 'flash':
+            x = F.scaled_dot_product_attention(
+                q, k, v, dropout_p=self.attn_drop_p, attn_mask=mask_binary
+            )
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        elif ATTENTION_MODE == 'math':
+            attn = (q @ k.transpose(-2, -1)) * self.scale
+            attn = add_mask(
+                attn, mask_binary
+            ) if mask_binary is not None else attn
+            attn = attn.softmax(dim=-1)
+            attn = self.attn_drop(attn)
+            x = (attn @ v).transpose(1, 2)
+            x = einops.rearrange(x, 'B H L D -> B L (H D)')
+        else:
+            raise NotImplementedError
+        return x
+    def _cat_mask(self, x, context, x_mask=None, context_mask=None):
+        B = x.shape[0]
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(
+                B, context.shape[-2], device=context.device
+            ).bool()
+        mask = torch.cat([context_mask, x_mask], dim=1)
+        return mask
+    def forward(self, x, context, x_mask=None, context_mask=None, extras=0):
+        B, Lx, C = x.shape
+        _, Lc, _ = context.shape
+        if x_mask is not None or context_mask is not None:
+            mask = self._cat_mask(
+                x, context, x_mask=x_mask, context_mask=context_mask
+            )
+            shape = [B, Lx + Lc, C]
+            mask_binary = create_mask(
+                q_shape=shape,
+                k_shape=shape,
+                device=x.device,
+                q_mask=None,
+                k_mask=mask
+            )
+        else:
+            mask_binary = None
+        qx, kx, vx = self.to_qx(x), self.to_kx(x), self.to_vx(x)
+        qc, kc, vc = self.to_qc(context), self.to_kc(context
+                                                    ), self.to_vc(context)
+        qx, kx, vx = map(
+            lambda t: einops.
+            rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
+            [qx, kx, vx]
+        )
+        qc, kc, vc = map(
+            lambda t: einops.
+            rearrange(t, 'B L (H D) -> B H L D', H=self.num_heads),
+            [qc, kc, vc]
+        )
+        qx, kx = self.norm_qx(qx), self.norm_kx(kx)
+        qc, kc = self.norm_qc(qc), self.norm_kc(kc)
+        q, k, v = (
+            torch.cat([qc, qx],
+                      dim=2), torch.cat([kc, kx],
+                                        dim=2), torch.cat([vc, vx], dim=2)
+        )
+        q, k = self._rotary(q, k, extras)
+        x = self._attn(q, k, v, mask_binary)
+        context, x = x[:, :Lc, :], x[:, Lc:, :]
+        x = self.proj_x(x)
+        x = self.proj_drop_x(x)
+        context = self.proj_c(context)
+        context = self.proj_drop_c(context)
+        return x, context

models/dit/audio_diffsingernet_dit.py ADDED Viewed

	@@ -0,0 +1,520 @@

+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from .mask_dit import DiTBlock, FinalBlock, UDiT
+from .modules import (
+    film_modulate,
+    PatchEmbed,
+    PE_wrapper,
+    TimestepEmbedder,
+    RMSNorm,
+)
+class AudioDiTBlock(DiTBlock):
+    """
+    A modified DiT block that fuses a time-aligned context into the latent.
+
+    After self-attention, the latent is passed through a dilated 1-D
+    convolution, summed with a linear projection of the time-aligned
+    context, and squashed by a sigmoid/tanh gate before the (optional)
+    cross-attention and the MLP.
+    """
+    def __init__(
+        self,
+        dim,
+        time_aligned_context_dim,
+        dilation,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+        """
+        Args:
+            dim: channel width of the latent.
+            time_aligned_context_dim: channel width of the time-aligned
+                context fed to ``ta_context_projection``.
+            dilation: dilation (and padding) of the 1-D convolution.
+
+        All remaining arguments are forwarded unchanged to
+        ``DiTBlock.__init__``.
+        """
+        super().__init__(
+            dim=dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=skip,
+            skip_norm=skip_norm,
+            rope_mode=rope_mode,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        # time-aligned context projection
+        # Both layers output 2 * dim channels: one half for the gate and one
+        # half for the filter of the gated activation in _forward.
+        self.ta_context_projection = nn.Linear(
+            time_aligned_context_dim, 2 * dim
+        )
+        # kernel_size=3 with padding == dilation keeps the sequence length
+        # unchanged.
+        self.dilated_conv = nn.Conv1d(
+            dim, 2 * dim, kernel_size=3, padding=dilation, dilation=dilation
+        )
+    def forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Run ``_forward``, wrapped in activation checkpointing when
+        ``self.use_checkpoint`` is set.
+
+        The arguments are passed positionally, so their order here must match
+        the signature of ``_forward``.
+        """
+        if self.use_checkpoint:
+            return checkpoint(
+                self._forward,
+                x,
+                time_aligned_context,
+                time_token,
+                time_ada,
+                skip,
+                context,
+                x_mask,
+                context_mask,
+                extras,
+                use_reentrant=False
+            )
+        else:
+            return self._forward(
+                x,
+                time_aligned_context,
+                time_token,
+                time_ada,
+                skip,
+                context,
+                x_mask,
+                context_mask,
+                extras,
+            )
+    def _forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Block body: skip fusion, self-attention, time-aligned context gate,
+        optional cross-attention, MLP.
+
+        Args:
+            x: latent; must unpack into three axes (B, T, C).
+            time_aligned_context: context that is projected and added to the
+                convolved latent -- presumably (B, T, time_aligned_context_dim)
+                so that the sum is well-formed; TODO confirm against caller.
+            time_token, time_ada: inputs to ``self.adaln`` when
+                ``self.use_adanorm`` is set.
+            skip: tensor concatenated with ``x`` on the last axis when the
+                block has a ``skip_linear``; required in that case.
+            context, context_mask: cross-attention inputs; ``context`` is
+                required when ``self.use_context`` is set.
+            x_mask: mask passed to self-attention as ``context_mask``.
+            extras: forwarded to both attention calls.
+
+        Returns:
+            The updated latent.
+        """
+        B, T, C = x.shape
+        # Long-skip fusion: concat on channels, normalize, project back.
+        if self.skip_linear is not None:
+            assert skip is not None
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+        # Split the adaptive-norm output into the six modulation terms.
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = time_ada.chunk(6, dim=1)
+        # self attention
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            x = x + (1-gate_msa) * self.attn(
+                x_norm, context=None, context_mask=x_mask, extras=extras
+            )
+        else:
+            # TODO diffusion timestep input is not fused here
+            x = x + self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+        # time-aligned context
+        # Conv1d works on (B, C, T), hence the transposes around it. The
+        # result replaces x (no residual connection around this step).
+        time_aligned_context = self.ta_context_projection(time_aligned_context)
+        x = self.dilated_conv(x.transpose(1, 2)
+                             ).transpose(1, 2) + time_aligned_context
+        # Gated activation: halve the 2 * dim channels into gate and filter.
+        gate, filter = torch.chunk(x, 2, dim=-1)
+        x = torch.sigmoid(gate) * torch.tanh(filter)
+        # cross attention
+        if self.use_context:
+            assert context is not None
+            x = x + self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+        # mlp
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1-gate_mlp) * self.mlp(x_norm)
+        else:
+            x = x + self.mlp(self.norm3(x))
+        return x
+class AudioUDiT(UDiT):
+    """
+    U-shaped DiT built from ``AudioDiTBlock`` layers with dilated
+    convolutions.
+
+    The network consists of ``depth // 2`` input blocks, one middle block and
+    ``depth // 2`` output blocks; when ``skip`` is enabled the output of each
+    input block is fed to an output block in last-in-first-out order. Every
+    block also receives the time-aligned context.
+    """
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        dilation_cycle_length=4,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        time_aligned_context_dim=768,
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        """
+        Args:
+            img_size: input size; indexed as ``img_size[0]``/``img_size[1]``
+                for '2d' input and used as a plain number for '1d' input.
+            patch_size: patch size used for the patch embedding and to derive
+                the number of patches.
+            in_chans / out_chans: input / output channels; ``out_chans``
+                defaults to ``in_chans``.
+            input_type: '2d' or '1d'.
+            embed_dim: token width.
+            depth: total block budget; ``depth // 2`` blocks are built on
+                each side of the middle block.
+            dilation_cycle_length: block ``i`` uses dilation
+                ``2 ** (i % dilation_cycle_length)``.
+            norm_layer: 'layernorm' or 'rmsnorm'.
+            time_fusion: 'token', 'ada', 'ada_single', 'ada_sola' or
+                'ada_sola_bias'.
+            cls_dim: when not ``None``, a class embedding MLP is created.
+            time_aligned_context_dim: channel width of the time-aligned
+                context passed to every block.
+            context_dim: when not ``None``, a context embedding MLP is
+                created.
+            context_fusion: 'concat', 'joint' or 'cross'.
+            context_max_length: length used for the context position
+                embedding and, for 'concat'/'joint', added to ``self.extras``.
+            skip / skip_norm: passed to the output blocks.
+
+        The remaining arguments are forwarded to the blocks.
+
+        Raises:
+            NotImplementedError: for an unsupported ``time_fusion``,
+                ``context_fusion`` or ``norm_layer``.
+        """
+        # Calls nn.Module.__init__ directly rather than super().__init__(),
+        # so UDiT.__init__ is not run and every submodule is built here.
+        nn.Module.__init__(self)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        # input
+        self.in_chans = in_chans
+        self.input_type = input_type
+        # NOTE(review): the default img_size=224 is an int and cannot be
+        # indexed, so the default '2d' input_type needs an explicit
+        # two-element img_size. Any other input_type leaves num_patches
+        # unbound.
+        if self.input_type == '2d':
+            num_patches = (img_size[0] //
+                           patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+        # position embedding
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+        # time embed
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+        # cls embed
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+        # time fusion
+        # self.extras counts the tokens prepended to the patch sequence.
+        if time_fusion == 'token':
+            # put token at the beginning of sequence
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            # avoid repetitive SiLU for each adaLN block
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            # Modulation for the final block: 2 * embed_dim outputs.
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                # shared adaln
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+        # context
+        # use a simple projection
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                # Context tokens are prepended to the sequence, so they count
+                # as extras.
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                # no cross attention layers
+                # (the blocks below are built with context_dim=None)
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                # Blocks see the context after it was embedded to embed_dim.
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+        self.use_skip = skip
+        # norm layers
+        # Map the string option to the class handed to the blocks.
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+        # Input half: no skip inputs; dilation cycles through powers of two.
+        self.in_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                time_aligned_context_dim=time_aligned_context_dim,
+                dilation=2**(i % dilation_cycle_length),
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+        # Middle block: fixed dilation of 1, no skip input.
+        self.mid_block = AudioDiTBlock(
+            dim=embed_dim,
+            time_aligned_context_dim=time_aligned_context_dim,
+            dilation=1,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        # Output half: receives the skip settings from the constructor.
+        self.out_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                time_aligned_context_dim=time_aligned_context_dim,
+                dilation=2**(i % dilation_cycle_length),
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+        # FinalLayer block
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        # Not defined in this class -- presumably inherited from UDiT.
+        self.initialize_weights()
+    def forward(
+        self,
+        x,
+        timesteps,
+        time_aligned_context,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        """
+        Run the U-shaped stack of blocks and the final block.
+
+        Args:
+            x: input handed to the patch embedding.
+            timesteps: diffusion timesteps; a 0-dim tensor is expanded to the
+                batch size.
+            time_aligned_context: passed unchanged to every block.
+            context: conditioning input; embedded only when
+                ``self.use_context`` is set.
+            x_mask, context_mask: optional masks for the two sequences.
+            cls_token: class conditioning; used only when a class embedding
+                exists.
+            controlnet_skips: optional sequence of residuals. Entries are
+                consumed with ``.pop()``, so the object passed in is
+                modified.
+
+        Returns:
+            The output of ``self.final_block``.
+        """
+        # make it compatible with int time step during inference
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]
+                                        ).to(x.device, dtype=torch.long)
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+        B, L, D = x.shape
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                # _concat_x_context is not defined here -- presumably
+                # inherited from UDiT. After merging, the blocks get no
+                # separate context.
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            # Adaptive-norm path: the class embedding is added to the time
+            # embedding, and the modulation inputs are computed once here.
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            # Token path: prepend the time (and class) token to the sequence
+            # and extend the mask with ones for the new positions.
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ],
+                                   dim=1)
+            # The token now lives inside x; blocks get no separate time token.
+            time_token = None
+        # Input half: collect block outputs as skips when enabled.
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+            if self.use_skip:
+                skips.append(x)
+        x = self.mid_block(
+            x=x,
+            time_aligned_context=time_aligned_context,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+        # Output half: skips are consumed last-in-first-out.
+        for blk in self.out_blocks:
+            if self.use_skip:
+                skip = skips.pop()
+                if controlnet_skips:
+                    # add to skip like u-net controlnet
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    # directly add to x
+                    x = x + controlnet_skips.pop()
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+        return x

models/dit/audio_dit.py ADDED Viewed

	@@ -0,0 +1,549 @@

+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from .mask_dit import DiTBlock, FinalBlock, UDiT
+from .modules import (
+    film_modulate,
+    PatchEmbed,
+    PE_wrapper,
+    TimestepEmbedder,
+    RMSNorm,
+)
+class AudioDiTBlock(DiTBlock):
+    """
+    A modified DiT block that fuses a time-aligned context into the latent.
+
+    The fusion happens after self-attention and is either additive ('add')
+    or a concatenation followed by a linear projection ('concat').
+    """
+    def __init__(
+        self,
+        dim,
+        ta_context_dim,
+        ta_context_norm=False,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        ta_context_fusion='add',
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+        """
+        Args:
+            dim: channel width of the latent.
+            ta_context_dim: channel width of the time-aligned context.
+            ta_context_norm: whether to normalize before the time-aligned
+                context projection (``norm_layer`` is used; otherwise
+                ``nn.Identity``).
+            ta_context_fusion: 'add' or 'concat'. For any other value no
+                projection is created and ``_forward`` skips the fusion.
+
+        All remaining arguments are forwarded unchanged to
+        ``DiTBlock.__init__``.
+        """
+        super().__init__(
+            dim=dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=skip,
+            skip_norm=skip_norm,
+            rope_mode=rope_mode,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        self.ta_context_fusion = ta_context_fusion
+        # Holds the boolean flag for now; replaced below by the actual norm
+        # module (or nn.Identity) for the 'add' and 'concat' modes.
+        self.ta_context_norm = ta_context_norm
+        if self.ta_context_fusion == "add":
+            # Norm acts on the raw context, then it is projected to dim.
+            self.ta_context_projection = nn.Linear(ta_context_dim, dim)
+            self.ta_context_norm = norm_layer(
+                ta_context_dim
+            ) if self.ta_context_norm else nn.Identity()
+        elif self.ta_context_fusion == "concat":
+            # Norm and projection act on the channel-wise concatenation of
+            # the latent and the context.
+            self.ta_context_projection = nn.Linear(ta_context_dim + dim, dim)
+            self.ta_context_norm = norm_layer(
+                ta_context_dim + dim
+            ) if self.ta_context_norm else nn.Identity()
+    def forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Run ``_forward``, wrapped in activation checkpointing when
+        ``self.use_checkpoint`` is set.
+
+        The arguments are passed positionally, so their order here must match
+        the signature of ``_forward``.
+        """
+        if self.use_checkpoint:
+            return checkpoint(
+                self._forward,
+                x,
+                time_aligned_context,
+                time_token,
+                time_ada,
+                skip,
+                context,
+                x_mask,
+                context_mask,
+                extras,
+                use_reentrant=False
+            )
+        else:
+            return self._forward(
+                x,
+                time_aligned_context,
+                time_token,
+                time_ada,
+                skip,
+                context,
+                x_mask,
+                context_mask,
+                extras,
+            )
+    def _forward(
+        self,
+        x,
+        time_aligned_context,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        """
+        Block body: skip fusion, self-attention, time-aligned context
+        fusion, optional cross-attention, MLP.
+
+        Args:
+            x: latent; must unpack into three axes (B, T, C).
+            time_aligned_context: context added to, or concatenated with,
+                the latent -- presumably shares the leading axes of ``x``;
+                TODO confirm against caller.
+            time_token, time_ada: inputs to ``self.adaln`` when
+                ``self.use_adanorm`` is set.
+            skip: tensor concatenated with ``x`` on the last axis when the
+                block has a ``skip_linear``; required in that case.
+            context, context_mask: cross-attention inputs; ``context`` is
+                required when ``self.use_context`` is set.
+            x_mask: mask passed to self-attention as ``context_mask``.
+            extras: forwarded to both attention calls.
+
+        Returns:
+            The updated latent.
+        """
+        B, T, C = x.shape
+        # NOTE: the time-aligned context is fused after self-attention (see
+        # below), not at the start of the block.
+        # skip connection
+        if self.skip_linear is not None:
+            assert skip is not None
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+        # Split the adaptive-norm output into the six modulation terms.
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = time_ada.chunk(6, dim=1)
+        # self attention
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            x = x + (1-gate_msa) * self.attn(
+                x_norm, context=None, context_mask=x_mask, extras=extras
+            )
+        else:
+            # TODO diffusion timestep input is not fused here
+            x = x + self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+        # time aligned context fusion
+        if self.ta_context_fusion == "add":
+            # Residual: the projected context is added to the latent.
+            time_aligned_context = self.ta_context_projection(
+                self.ta_context_norm(time_aligned_context)
+            )
+            x = x + time_aligned_context
+        elif self.ta_context_fusion == "concat":
+            # No residual: the projection of [x, context] replaces x.
+            cat = torch.cat([x, time_aligned_context], dim=-1)
+            cat = self.ta_context_norm(cat)
+            x = self.ta_context_projection(cat)
+        # cross attention
+        if self.use_context:
+            assert context is not None
+            x = x + self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+        # mlp
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1-gate_mlp) * self.mlp(x_norm)
+        else:
+            x = x + self.mlp(self.norm3(x))
+        return x
+class AudioUDiT(UDiT):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        ta_context_dim=768,
+        ta_context_fusion='concat',
+        ta_context_norm=True,
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        nn.Module.__init__(self)
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        # input
+        self.in_chans = in_chans
+        self.input_type = input_type
+        if self.input_type == '2d':
+            num_patches = (img_size[0] //
+                           patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+        # position embedding
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+        # time embed
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+        # cls embed
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+        # time fusion
+        if time_fusion == 'token':
+            # put token at the beginning of sequence
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            # aviod  repetitive silu for each adaln block
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                # shared adaln
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+        # context
+        # use a simple projection
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                # no cross attention layers
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+        self.use_skip = skip
+        # norm layers
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+        self.in_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                ta_context_dim=ta_context_dim,
+                ta_context_fusion=ta_context_fusion,
+                ta_context_norm=ta_context_norm,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+        self.mid_block = AudioDiTBlock(
+            dim=embed_dim,
+            ta_context_dim=ta_context_dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            ta_context_fusion=ta_context_fusion,
+            ta_context_norm=ta_context_norm,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        self.out_blocks = nn.ModuleList([
+            AudioDiTBlock(
+                dim=embed_dim,
+                ta_context_dim=ta_context_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                ta_context_fusion=ta_context_fusion,
+                ta_context_norm=ta_context_norm,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for i in range(depth // 2)
+        ])
+        # FinalLayer block
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        self.initialize_weights()
+    def forward(
+        self,
+        x,
+        timesteps,
+        time_aligned_context,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        # make it compatible with int time step during inference
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]
+                                        ).to(x.device, dtype=torch.long)
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+        B, L, D = x.shape
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ],
+                                   dim=1)
+            time_token = None
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+            if self.use_skip:
+                skips.append(x)
+        x = self.mid_block(
+            x=x,
+            time_aligned_context=time_aligned_context,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+        for blk in self.out_blocks:
+            if self.use_skip:
+                skip = skips.pop()
+                if controlnet_skips:
+                    # add to skip like u-net controlnet
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    # directly add to x
+                    x = x + controlnet_skips.pop()
+            x = blk(
+                x=x,
+                time_aligned_context=time_aligned_context,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+        return x

models/dit/mask_dit.py ADDED Viewed

	@@ -0,0 +1,823 @@

+import logging
+import math
+import torch
+import torch.nn as nn
+from torch.utils.checkpoint import checkpoint
+from .modules import (
+    film_modulate,
+    unpatchify,
+    PatchEmbed,
+    PE_wrapper,
+    TimestepEmbedder,
+    FeedForward,
+    RMSNorm,
+)
+from .span_mask import compute_mask_indices
+from .attention import Attention
+logger = logging.Logger(__file__)
+class AdaLN(nn.Module):
+    def __init__(self, dim, ada_mode='ada', r=None, alpha=None):
+        super().__init__()
+        self.ada_mode = ada_mode
+        self.scale_shift_table = None
+        if ada_mode == 'ada':
+            # move nn.silu outside
+            self.time_ada = nn.Linear(dim, 6 * dim, bias=True)
+        elif ada_mode == 'ada_single':
+            # adaln used in pixel-art alpha
+            self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        elif ada_mode in ['ada_solo', 'ada_sola_bias']:
+            self.lora_a = nn.Linear(dim, r * 6, bias=False)
+            self.lora_b = nn.Linear(r * 6, dim * 6, bias=False)
+            self.scaling = alpha / r
+            if ada_mode == 'ada_sola_bias':
+                # take bias out for consistency
+                self.scale_shift_table = nn.Parameter(torch.zeros(6, dim))
+        else:
+            raise NotImplementedError
+    def forward(self, time_token=None, time_ada=None):
+        if self.ada_mode == 'ada':
+            assert time_ada is None
+            B = time_token.shape[0]
+            time_ada = self.time_ada(time_token).reshape(B, 6, -1)
+        elif self.ada_mode == 'ada_single':
+            B = time_ada.shape[0]
+            time_ada = time_ada.reshape(B, 6, -1)
+            time_ada = self.scale_shift_table[None] + time_ada
+        elif self.ada_mode in ['ada_sola', 'ada_sola_bias']:
+            B = time_ada.shape[0]
+            time_ada_lora = self.lora_b(self.lora_a(time_token)) * self.scaling
+            time_ada = time_ada + time_ada_lora
+            time_ada = time_ada.reshape(B, 6, -1)
+            if self.scale_shift_table is not None:
+                time_ada = self.scale_shift_table[None] + time_ada
+        else:
+            raise NotImplementedError
+        return time_ada
+class DiTBlock(nn.Module):
+    """
+    A modified PixArt block with adaptive layer norm (adaLN-single) conditioning.
+    """
+    def __init__(
+        self,
+        dim,
+        context_dim=None,
+        num_heads=8,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer=nn.LayerNorm,
+        time_fusion='none',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        skip=False,
+        skip_norm=False,
+        rope_mode='none',
+        context_norm=False,
+        use_checkpoint=False
+    ):
+        super().__init__()
+        self.norm1 = norm_layer(dim)
+        self.attn = Attention(
+            dim=dim,
+            num_heads=num_heads,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            rope_mode=rope_mode
+        )
+        if context_dim is not None:
+            self.use_context = True
+            self.cross_attn = Attention(
+                dim=dim,
+                num_heads=num_heads,
+                context_dim=context_dim,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                rope_mode='none'
+            )
+            self.norm2 = norm_layer(dim)
+            if context_norm:
+                self.norm_context = norm_layer(context_dim)
+            else:
+                self.norm_context = nn.Identity()
+        else:
+            self.use_context = False
+        self.norm3 = norm_layer(dim)
+        self.mlp = FeedForward(
+            dim=dim, mult=mlp_ratio, activation_fn=act_layer, dropout=0
+        )
+        self.use_adanorm = True if time_fusion != 'token' else False
+        if self.use_adanorm:
+            self.adaln = AdaLN(
+                dim,
+                ada_mode=time_fusion,
+                r=ada_sola_rank,
+                alpha=ada_sola_alpha
+            )
+        if skip:
+            self.skip_norm = norm_layer(2 *
+                                        dim) if skip_norm else nn.Identity()
+            self.skip_linear = nn.Linear(2 * dim, dim)
+        else:
+            self.skip_linear = None
+        self.use_checkpoint = use_checkpoint
+    def forward(
+        self,
+        x,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        if self.use_checkpoint:
+            return checkpoint(
+                self._forward,
+                x,
+                time_token,
+                time_ada,
+                skip,
+                context,
+                x_mask,
+                context_mask,
+                extras,
+                use_reentrant=False
+            )
+        else:
+            return self._forward(
+                x, time_token, time_ada, skip, context, x_mask, context_mask,
+                extras
+            )
+    def _forward(
+        self,
+        x,
+        time_token=None,
+        time_ada=None,
+        skip=None,
+        context=None,
+        x_mask=None,
+        context_mask=None,
+        extras=None
+    ):
+        B, T, C = x.shape
+        if self.skip_linear is not None:
+            assert skip is not None
+            cat = torch.cat([x, skip], dim=-1)
+            cat = self.skip_norm(cat)
+            x = self.skip_linear(cat)
+        if self.use_adanorm:
+            time_ada = self.adaln(time_token, time_ada)
+            (shift_msa, scale_msa, gate_msa, shift_mlp, scale_mlp,
+             gate_mlp) = time_ada.chunk(6, dim=1)
+        # self attention
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm1(x), shift=shift_msa, scale=scale_msa
+            )
+            x = x + (1-gate_msa) * self.attn(
+                x_norm, context=None, context_mask=x_mask, extras=extras
+            )
+        else:
+            x = x + self.attn(
+                self.norm1(x),
+                context=None,
+                context_mask=x_mask,
+                extras=extras
+            )
+        # cross attention
+        if self.use_context:
+            assert context is not None
+            x = x + self.cross_attn(
+                x=self.norm2(x),
+                context=self.norm_context(context),
+                context_mask=context_mask,
+                extras=extras
+            )
+        # mlp
+        if self.use_adanorm:
+            x_norm = film_modulate(
+                self.norm3(x), shift=shift_mlp, scale=scale_mlp
+            )
+            x = x + (1-gate_mlp) * self.mlp(x_norm)
+        else:
+            x = x + self.mlp(self.norm3(x))
+        return x
+class FinalBlock(nn.Module):
+    def __init__(
+        self,
+        embed_dim,
+        patch_size,
+        in_chans,
+        img_size,
+        input_type='2d',
+        norm_layer=nn.LayerNorm,
+        use_conv=True,
+        use_adanorm=True
+    ):
+        super().__init__()
+        self.in_chans = in_chans
+        self.img_size = img_size
+        self.input_type = input_type
+        self.norm = norm_layer(embed_dim)
+        if use_adanorm:
+            self.use_adanorm = True
+        else:
+            self.use_adanorm = False
+        if input_type == '2d':
+            self.patch_dim = patch_size**2 * in_chans
+            self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
+            if use_conv:
+                self.final_layer = nn.Conv2d(
+                    self.in_chans, self.in_chans, 3, padding=1
+                )
+            else:
+                self.final_layer = nn.Identity()
+        elif input_type == '1d':
+            self.patch_dim = patch_size * in_chans
+            self.linear = nn.Linear(embed_dim, self.patch_dim, bias=True)
+            if use_conv:
+                self.final_layer = nn.Conv1d(
+                    self.in_chans, self.in_chans, 3, padding=1
+                )
+            else:
+                self.final_layer = nn.Identity()
+    def forward(self, x, time_ada=None, extras=0):
+        B, T, C = x.shape
+        x = x[:, extras:, :]
+        # only handle generation target
+        if self.use_adanorm:
+            shift, scale = time_ada.reshape(B, 2, -1).chunk(2, dim=1)
+            x = film_modulate(self.norm(x), shift, scale)
+        else:
+            x = self.norm(x)
+        x = self.linear(x)
+        x = unpatchify(x, self.in_chans, self.input_type, self.img_size)
+        x = self.final_layer(x)
+        return x
+class UDiT(nn.Module):
+    def __init__(
+        self,
+        img_size=224,
+        patch_size=16,
+        in_chans=3,
+        input_type='2d',
+        out_chans=None,
+        embed_dim=768,
+        depth=12,
+        num_heads=12,
+        mlp_ratio=4.,
+        qkv_bias=False,
+        qk_scale=None,
+        qk_norm=None,
+        act_layer='gelu',
+        norm_layer='layernorm',
+        context_norm=False,
+        use_checkpoint=False,
+        # time fusion ada or token
+        time_fusion='token',
+        ada_sola_rank=None,
+        ada_sola_alpha=None,
+        cls_dim=None,
+        # max length is only used for concat
+        context_dim=768,
+        context_fusion='concat',
+        context_max_length=128,
+        context_pe_method='sinu',
+        pe_method='abs',
+        rope_mode='none',
+        use_conv=True,
+        skip=True,
+        skip_norm=True
+    ):
+        super().__init__()
+        self.num_features = self.embed_dim = embed_dim  # num_features for consistency with other models
+        # input
+        self.in_chans = in_chans
+        self.input_type = input_type
+        if self.input_type == '2d':
+            num_patches = (img_size[0] //
+                           patch_size) * (img_size[1] // patch_size)
+        elif self.input_type == '1d':
+            num_patches = img_size // patch_size
+        self.patch_embed = PatchEmbed(
+            patch_size=patch_size,
+            in_chans=in_chans,
+            embed_dim=embed_dim,
+            input_type=input_type
+        )
+        out_chans = in_chans if out_chans is None else out_chans
+        self.out_chans = out_chans
+        # position embedding
+        self.rope = rope_mode
+        self.x_pe = PE_wrapper(
+            dim=embed_dim, method=pe_method, length=num_patches
+        )
+        logger.info(f'x position embedding: {pe_method}')
+        logger.info(f'rope mode: {self.rope}')
+        # time embed
+        self.time_embed = TimestepEmbedder(embed_dim)
+        self.time_fusion = time_fusion
+        self.use_adanorm = False
+        # cls embed
+        if cls_dim is not None:
+            self.cls_embed = nn.Sequential(
+                nn.Linear(cls_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+        else:
+            self.cls_embed = None
+        # time fusion
+        if time_fusion == 'token':
+            # put token at the beginning of sequence
+            self.extras = 2 if self.cls_embed else 1
+            self.time_pe = PE_wrapper(
+                dim=embed_dim, method='abs', length=self.extras
+            )
+        elif time_fusion in ['ada', 'ada_single', 'ada_sola', 'ada_sola_bias']:
+            self.use_adanorm = True
+            # aviod  repetitive silu for each adaln block
+            self.time_act = nn.SiLU()
+            self.extras = 0
+            self.time_ada_final = nn.Linear(
+                embed_dim, 2 * embed_dim, bias=True
+            )
+            if time_fusion in ['ada_single', 'ada_sola', 'ada_sola_bias']:
+                # shared adaln
+                self.time_ada = nn.Linear(embed_dim, 6 * embed_dim, bias=True)
+            else:
+                self.time_ada = None
+        else:
+            raise NotImplementedError
+        logger.info(f'time fusion mode: {self.time_fusion}')
+        # context
+        # use a simple projection
+        self.use_context = False
+        self.context_cross = False
+        self.context_max_length = context_max_length
+        self.context_fusion = 'none'
+        if context_dim is not None:
+            self.use_context = True
+            self.context_embed = nn.Sequential(
+                nn.Linear(context_dim, embed_dim, bias=True),
+                nn.SiLU(),
+                nn.Linear(embed_dim, embed_dim, bias=True),
+            )
+            self.context_fusion = context_fusion
+            if context_fusion == 'concat' or context_fusion == 'joint':
+                self.extras += context_max_length
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                # no cross attention layers
+                context_dim = None
+            elif context_fusion == 'cross':
+                self.context_pe = PE_wrapper(
+                    dim=embed_dim,
+                    method=context_pe_method,
+                    length=context_max_length
+                )
+                self.context_cross = True
+                context_dim = embed_dim
+            else:
+                raise NotImplementedError
+        logger.info(f'context fusion mode: {context_fusion}')
+        logger.info(f'context position embedding: {context_pe_method}')
+        self.use_skip = skip
+        # norm layers
+        if norm_layer == 'layernorm':
+            norm_layer = nn.LayerNorm
+        elif norm_layer == 'rmsnorm':
+            norm_layer = RMSNorm
+        else:
+            raise NotImplementedError
+        logger.info(f'use long skip connection: {skip}')
+        self.in_blocks = nn.ModuleList([
+            DiTBlock(
+                dim=embed_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=False,
+                skip_norm=False,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for _ in range(depth // 2)
+        ])
+        self.mid_block = DiTBlock(
+            dim=embed_dim,
+            context_dim=context_dim,
+            num_heads=num_heads,
+            mlp_ratio=mlp_ratio,
+            qkv_bias=qkv_bias,
+            qk_scale=qk_scale,
+            qk_norm=qk_norm,
+            act_layer=act_layer,
+            norm_layer=norm_layer,
+            time_fusion=time_fusion,
+            ada_sola_rank=ada_sola_rank,
+            ada_sola_alpha=ada_sola_alpha,
+            skip=False,
+            skip_norm=False,
+            rope_mode=self.rope,
+            context_norm=context_norm,
+            use_checkpoint=use_checkpoint
+        )
+        self.out_blocks = nn.ModuleList([
+            DiTBlock(
+                dim=embed_dim,
+                context_dim=context_dim,
+                num_heads=num_heads,
+                mlp_ratio=mlp_ratio,
+                qkv_bias=qkv_bias,
+                qk_scale=qk_scale,
+                qk_norm=qk_norm,
+                act_layer=act_layer,
+                norm_layer=norm_layer,
+                time_fusion=time_fusion,
+                ada_sola_rank=ada_sola_rank,
+                ada_sola_alpha=ada_sola_alpha,
+                skip=skip,
+                skip_norm=skip_norm,
+                rope_mode=self.rope,
+                context_norm=context_norm,
+                use_checkpoint=use_checkpoint
+            ) for _ in range(depth // 2)
+        ])
+        # FinalLayer block
+        self.use_conv = use_conv
+        self.final_block = FinalBlock(
+            embed_dim=embed_dim,
+            patch_size=patch_size,
+            img_size=img_size,
+            in_chans=out_chans,
+            input_type=input_type,
+            norm_layer=norm_layer,
+            use_conv=use_conv,
+            use_adanorm=self.use_adanorm
+        )
+        self.initialize_weights()
+    def _init_ada(self):
+        if self.time_fusion == 'ada':
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.weight, 0)
+            nn.init.constant_(self.mid_block.adaln.time_ada.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.adaln.time_ada.weight, 0)
+                nn.init.constant_(block.adaln.time_ada.bias, 0)
+        elif self.time_fusion == 'ada_single':
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+        elif self.time_fusion in ['ada_sola', 'ada_sola_bias']:
+            nn.init.constant_(self.time_ada.weight, 0)
+            nn.init.constant_(self.time_ada.bias, 0)
+            nn.init.constant_(self.time_ada_final.weight, 0)
+            nn.init.constant_(self.time_ada_final.bias, 0)
+            for block in self.in_blocks:
+                nn.init.kaiming_uniform_(
+                    block.adaln.lora_a.weight, a=math.sqrt(5)
+                )
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+            nn.init.kaiming_uniform_(
+                self.mid_block.adaln.lora_a.weight, a=math.sqrt(5)
+            )
+            nn.init.constant_(self.mid_block.adaln.lora_b.weight, 0)
+            for block in self.out_blocks:
+                nn.init.kaiming_uniform_(
+                    block.adaln.lora_a.weight, a=math.sqrt(5)
+                )
+                nn.init.constant_(block.adaln.lora_b.weight, 0)
+    def initialize_weights(self):
+        # Basic init for all layers
+        def _basic_init(module):
+            if isinstance(module, nn.Linear):
+                torch.nn.init.xavier_uniform_(module.weight)
+                if module.bias is not None:
+                    nn.init.constant_(module.bias, 0)
+        self.apply(_basic_init)
+        # init patch Conv like Linear
+        w = self.patch_embed.proj.weight.data
+        nn.init.xavier_uniform_(w.view([w.shape[0], -1]))
+        nn.init.constant_(self.patch_embed.proj.bias, 0)
+        # Zero-out AdaLN
+        if self.use_adanorm:
+            self._init_ada()
+        # Zero-out Cross Attention
+        if self.context_cross:
+            for block in self.in_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.weight, 0)
+            nn.init.constant_(self.mid_block.cross_attn.proj.bias, 0)
+            for block in self.out_blocks:
+                nn.init.constant_(block.cross_attn.proj.weight, 0)
+                nn.init.constant_(block.cross_attn.proj.bias, 0)
+        # Zero-out cls embedding
+        if self.cls_embed:
+            if self.use_adanorm:
+                nn.init.constant_(self.cls_embed[-1].weight, 0)
+                nn.init.constant_(self.cls_embed[-1].bias, 0)
+        # Zero-out Output
+        # might not zero-out this when using v-prediction
+        # it could be good when using noise-prediction
+        # nn.init.constant_(self.final_block.linear.weight, 0)
+        # nn.init.constant_(self.final_block.linear.bias, 0)
+        # if self.use_conv:
+        #     nn.init.constant_(self.final_block.final_layer.weight.data, 0)
+        #     nn.init.constant_(self.final_block.final_layer.bias, 0)
+        # init out Conv
+        if self.use_conv:
+            nn.init.xavier_uniform_(self.final_block.final_layer.weight)
+            nn.init.constant_(self.final_block.final_layer.bias, 0)
+    def _concat_x_context(self, x, context, x_mask=None, context_mask=None):
+        assert context.shape[-2] == self.context_max_length
+        # Check if either x_mask or context_mask is provided
+        B = x.shape[0]
+        # Create default masks if they are not provided
+        if x_mask is None:
+            x_mask = torch.ones(B, x.shape[-2], device=x.device).bool()
+        if context_mask is None:
+            context_mask = torch.ones(
+                B, context.shape[-2], device=context.device
+            ).bool()
+        # Concatenate the masks along the second dimension (dim=1)
+        x_mask = torch.cat([context_mask, x_mask], dim=1)
+        # Concatenate context and x along the second dimension (dim=1)
+        x = torch.cat((context, x), dim=1)
+        return x, x_mask
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        controlnet_skips=None,
+    ):
+        # make it compatible with int time step during inference
+        if timesteps.dim() == 0:
+            timesteps = timesteps.expand(x.shape[0]
+                                        ).to(x.device, dtype=torch.long)
+        x = self.patch_embed(x)
+        x = self.x_pe(x)
+        B, L, D = x.shape
+        if self.use_context:
+            context_token = self.context_embed(context)
+            context_token = self.context_pe(context_token)
+            if self.context_fusion == 'concat' or self.context_fusion == 'joint':
+                x, x_mask = self._concat_x_context(
+                    x=x,
+                    context=context_token,
+                    x_mask=x_mask,
+                    context_mask=context_mask
+                )
+                context_token, context_mask = None, None
+        else:
+            context_token, context_mask = None, None
+        time_token = self.time_embed(timesteps)
+        if self.cls_embed:
+            cls_token = self.cls_embed(cls_token)
+        time_ada = None
+        time_ada_final = None
+        if self.use_adanorm:
+            if self.cls_embed:
+                time_token = time_token + cls_token
+            time_token = self.time_act(time_token)
+            time_ada_final = self.time_ada_final(time_token)
+            if self.time_ada is not None:
+                time_ada = self.time_ada(time_token)
+        else:
+            time_token = time_token.unsqueeze(dim=1)
+            if self.cls_embed:
+                cls_token = cls_token.unsqueeze(dim=1)
+                time_token = torch.cat([time_token, cls_token], dim=1)
+            time_token = self.time_pe(time_token)
+            x = torch.cat((time_token, x), dim=1)
+            if x_mask is not None:
+                x_mask = torch.cat([
+                    torch.ones(B, time_token.shape[1],
+                               device=x_mask.device).bool(), x_mask
+                ],
+                                   dim=1)
+            time_token = None
+        skips = []
+        for blk in self.in_blocks:
+            x = blk(
+                x=x,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=None,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+            if self.use_skip:
+                skips.append(x)
+        x = self.mid_block(
+            x=x,
+            time_token=time_token,
+            time_ada=time_ada,
+            skip=None,
+            context=context_token,
+            x_mask=x_mask,
+            context_mask=context_mask,
+            extras=self.extras
+        )
+        for blk in self.out_blocks:
+            if self.use_skip:
+                skip = skips.pop()
+                if controlnet_skips:
+                    # add to skip like u-net controlnet
+                    skip = skip + controlnet_skips.pop()
+            else:
+                skip = None
+                if controlnet_skips:
+                    # directly add to x
+                    x = x + controlnet_skips.pop()
+            x = blk(
+                x=x,
+                time_token=time_token,
+                time_ada=time_ada,
+                skip=skip,
+                context=context_token,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                extras=self.extras
+            )
+        x = self.final_block(x, time_ada=time_ada_final, extras=self.extras)
+        return x
+class MaskDiT(nn.Module):
+    def __init__(
+        self,
+        model: UDiT,
+        mae=False,
+        mae_prob=0.5,
+        mask_ratio=[0.25, 1.0],
+        mask_span=10,
+    ):
+        super().__init__()
+        self.model = model
+        self.mae = mae
+        if self.mae:
+            out_channel = model.out_chans
+            self.mask_embed = nn.Parameter(torch.zeros((out_channel)))
+            self.mae_prob = mae_prob
+            self.mask_ratio = mask_ratio
+            self.mask_span = mask_span
+    def random_masking(self, gt, mask_ratios, mae_mask_infer=None):
+        B, D, L = gt.shape
+        if mae_mask_infer is None:
+            # mask = torch.rand(B, L).to(gt.device) < mask_ratios.unsqueeze(1)
+            mask_ratios = mask_ratios.cpu().numpy()
+            mask = compute_mask_indices(
+                shape=[B, L],
+                padding_mask=None,
+                mask_prob=mask_ratios,
+                mask_length=self.mask_span,
+                mask_type="static",
+                mask_other=0.0,
+                min_masks=1,
+                no_overlap=False,
+                min_space=0,
+            )
+            mask = mask.unsqueeze(1).expand_as(gt)
+        else:
+            mask = mae_mask_infer
+            mask = mask.expand_as(gt)
+        gt[mask] = self.mask_embed.view(1, D, 1).expand_as(gt)[mask]
+        return gt, mask.type_as(gt)
+    def forward(
+        self,
+        x,
+        timesteps,
+        context,
+        x_mask=None,
+        context_mask=None,
+        cls_token=None,
+        gt=None,
+        mae_mask_infer=None,
+        forward_model=True
+    ):
+        # todo: handle controlnet inside
+        mae_mask = torch.ones_like(x)
+        if self.mae:
+            if gt is not None:
+                B, D, L = gt.shape
+                mask_ratios = torch.FloatTensor(B).uniform_(*self.mask_ratio
+                                                           ).to(gt.device)
+                gt, mae_mask = self.random_masking(
+                    gt, mask_ratios, mae_mask_infer
+                )
+                # apply mae only to the selected batches
+                if mae_mask_infer is None:
+                    # determine mae batch
+                    mae_batch = torch.rand(B) < self.mae_prob
+                    gt[~mae_batch] = self.mask_embed.view(
+                        1, D, 1
+                    ).expand_as(gt)[~mae_batch]
+                    mae_mask[~mae_batch] = 1.0
+            else:
+                B, D, L = x.shape
+                gt = self.mask_embed.view(1, D, 1).expand_as(x)
+            x = torch.cat([x, gt, mae_mask[:, 0:1, :]], dim=1)
+        if forward_model:
+            x = self.model(
+                x=x,
+                timesteps=timesteps,
+                context=context,
+                x_mask=x_mask,
+                context_mask=context_mask,
+                cls_token=cls_token
+            )
+            # logger.info(mae_mask[:, 0, :].sum(dim=-1))
+        return x, mae_mask