Spaces:

noblebarkrr
/

mvsepless_zero_gpu

Paused

App Files Files Community

noblebarkrr committed on 26 days ago

Commit

09d1dfb

verified ·

1 Parent(s): 51c079b

Upload 15 files

Browse files

Files changed (14) hide show

models/bs_roformer/__init__.py +11 -11
models/bs_roformer/attend.py +150 -127
models/bs_roformer/bs_conformer.py +5 -27
models/bs_roformer/bs_roformer.py +767 -766
models/bs_roformer/bs_roformer_conditional.py +9 -41
models/bs_roformer/bs_roformer_fno.py +685 -704
models/bs_roformer/bs_roformer_hyperace.py +1103 -1122
models/bs_roformer/bs_roformer_hyperace2.py +1147 -1166
models/bs_roformer/bs_roformer_sw.py +657 -676
models/bs_roformer/bs_roformer_unwa_inst_large_2.py +8 -31
models/bs_roformer/bs_siamese_roformer.py +7 -30
models/bs_roformer/fno1d.py +0 -0
models/bs_roformer/mel_band_conformer.py +2 -26
models/bs_roformer/mel_band_roformer.py +748 -747

models/bs_roformer/__init__.py CHANGED Viewed

@@ -1,11 +1,11 @@
-from .bs_roformer import BSRoformer
-from .bs_conformer import BSConformer
-from .bs_roformer_sw import BSRoformer_SW
-from .bs_roformer_fno import BSRoformer_FNO
-from .bs_roformer_hyperace import BSRoformerHyperACE
-from .bs_roformer_hyperace2 import BSRoformerHyperACE_2
-from .bs_roformer_conditional import BSRoformer_Conditional
-from .bs_roformer_unwa_inst_large_2 import BSRoformer_2
-from .bs_siamese_roformer import BSSiameseRoformer
-from .mel_band_roformer import MelBandRoformer
-from .mel_band_conformer import MelBandConformer

+from .bs_roformer import BSRoformer
+from .bs_conformer import BSConformer
+from .bs_roformer_sw import BSRoformer_SW
+from .bs_roformer_fno import BSRoformer_FNO
+from .bs_roformer_hyperace import BSRoformerHyperACE
+from .bs_roformer_hyperace2 import BSRoformerHyperACE_2
+from .bs_roformer_conditional import BSRoformer_Conditional
+from .bs_roformer_unwa_inst_large_2 import BSRoformer_2
+from .bs_siamese_roformer import BSSiameseRoformer
+from .mel_band_roformer import MelBandRoformer
+from .mel_band_conformer import MelBandConformer

models/bs_roformer/attend.py CHANGED Viewed

@@ -1,128 +1,151 @@
-from functools import wraps
-from packaging import version
-from collections import namedtuple
-import os
-import torch
-from torch import nn, einsum
-import torch.nn.functional as F
-from einops import rearrange, reduce
-FlashAttentionConfig = namedtuple(
-    "FlashAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"]
-)
-def exists(val):
-    return val is not None
-def default(v, d):
-    return v if exists(v) else d
-def once(fn):
-    called = False
-    @wraps(fn)
-    def inner(x):
-        nonlocal called
-        if called:
-            return
-        called = True
-        return fn(x)
-    return inner
-print_once = once(print)
-class Attend(nn.Module):
-    def __init__(self, dropout=0.0, flash=False, scale=None):
-        super().__init__()
-        self.scale = scale
-        self.dropout = dropout
-        self.attn_dropout = nn.Dropout(dropout)
-        self.flash = flash
-        self.use_torch_2_sdpa = False
-        self._config_checked = False
-        # Проверяем версию PyTorch при первом вызове
-        if flash and not self._config_checked:
-            if version.parse(torch.__version__) >= version.parse("2.0.0"):
-                print_once("PyTorch >= 2.0 detected, will use SDPA if available.")
-                self.use_torch_2_sdpa = True
-                # Настройки для PyTorch >= 2.0
-                self.cpu_config = FlashAttentionConfig(True, True, True)
-                self.cuda_config = None
-                if torch.cuda.is_available():
-                    device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
-                    device_version = version.parse(
-                        f"{device_properties.major}.{device_properties.minor}"
-                    )
-                    if device_version >= version.parse("8.0"):
-                        if os.name == "nt":
-                            print_once(
-                                "Windows OS detected, using math or mem efficient attention if input tensor is on cuda"
-                            )
-                            self.cuda_config = FlashAttentionConfig(False, True, True)
-                        else:
-                            print_once(
-                                "GPU Compute Capability equal or above 8.0, using flash attention if input tensor is on cuda"
-                            )
-                            self.cuda_config = FlashAttentionConfig(True, False, False)
-                    else:
-                        print_once(
-                            "GPU Compute Capability below 8.0, using math or mem efficient attention if input tensor is on cuda"
-                        )
-                        self.cuda_config = FlashAttentionConfig(False, True, True)
-            else:
-                print_once("PyTorch < 2.0 detected, flash attention will use einsum fallback.")
-                self.use_torch_2_sdpa = False
-            self._config_checked = True
-    def flash_attn_torch2(self, q, k, v):
-        """SDPA для PyTorch >= 2.0"""
-        if exists(self.scale):
-            default_scale = q.shape[-1] ** -0.5
-            q = q * (self.scale / default_scale)
-        is_cuda = q.is_cuda
-        config = self.cuda_config if is_cuda else self.cpu_config
-        with torch.backends.cuda.sdp_kernel(**config._asdict()):
-            out = F.scaled_dot_product_attention(
-                q, k, v, dropout_p=self.dropout if self.training else 0.0
-            )
-        return out
-    def forward(self, q, k, v):
-        q_len, k_len, device = q.shape[-2], k.shape[-2], q.device
-        scale = default(self.scale, q.shape[-1] ** -0.5)
-        if self.flash and self.use_torch_2_sdpa:
-            try:
-                return self.flash_attn_torch2(q, k, v)
-            except Exception as e:
-                print(f"Flash attention failed: {e}. Falling back to einsum.")
-                self.use_torch_2_sdpa = False
-        # Fallback для PyTorch < 2.0 или если flash отключен
-        sim = einsum(f"b h i d, b h j d -> b h i j", q, k) * scale
-        attn = sim.softmax(dim=-1)
-        attn = self.attn_dropout(attn)
-        out = einsum(f"b h i j, b h j d -> b h i d", attn, v)
         return out

+from functools import wraps
+from packaging import version
+from collections import namedtuple
+import os
+import torch
+from torch import nn, einsum
+import torch.nn.functional as F
+from einops import rearrange, reduce
+# Which scaled-dot-product-attention backends may be used. The field names
+# match the keyword arguments of torch.backends.cuda.sdp_kernel, which
+# receives this tuple via `**config._asdict()`.
+FlashAttentionConfig = namedtuple(
+    "FlashAttentionConfig", ["enable_flash", "enable_math", "enable_mem_efficient"]
+)
+def exists(val):
+    """Return True when `val` is anything other than None."""
+    if val is None:
+        return False
+    return True
+def default(v, d):
+    """Return `v` unless it is None, in which case return the fallback `d`."""
+    if exists(v):
+        return v
+    return d
+def once(fn):
+    """Wrap `fn` so that only the first call is forwarded to it.
+
+    Args:
+        fn: Callable to guard.
+
+    Returns:
+        A wrapper that passes its arguments to `fn` on the first call and
+        returns the result; every later call does nothing and returns None.
+    """
+    called = False
+    @wraps(fn)
+    def inner(*args, **kwargs):
+        # Forward any positional/keyword arguments so wrapped callables such
+        # as print keep their full call signature.
+        nonlocal called
+        if called:
+            return None
+        called = True
+        return fn(*args, **kwargs)
+    return inner
+# A single guard shared by the whole module: only the very first message
+# passed to print_once is printed, whatever its text.
+print_once = once(print)
+class Attend(nn.Module):
+    """Softmax attention with an optional fused SDPA fast path.
+
+    Args:
+        dropout: Attention dropout probability; it only takes effect while the
+            module is in training mode.
+        flash: If True and PyTorch >= 2.0 is installed, attention is routed
+            through F.scaled_dot_product_attention; otherwise the explicit
+            einsum implementation is used.
+        scale: Optional multiplier for the q.k similarity. When None,
+            q.shape[-1] ** -0.5 is used.
+    """
+    def __init__(self, dropout=0.0, flash=False, scale=None):
+        super().__init__()
+        self.scale = scale
+        self.dropout = dropout
+        self.attn_dropout = nn.Dropout(dropout)
+        self.flash = flash
+        self.use_torch_2_sdpa = False
+        self._config_checked = False
+        # Backend configs only matter on the SDPA path; keep them defined as
+        # "not configured" so later lookups never raise AttributeError.
+        self.cpu_config = None
+        self.cuda_config = None
+        if not flash:
+            return
+        # Inspect the installed PyTorch once, at construction time.
+        if version.parse(torch.__version__) >= version.parse("2.0.0"):
+            print_once("PyTorch >= 2.0 detected, will use SDPA if available.")
+            self.use_torch_2_sdpa = True
+            self.cpu_config = FlashAttentionConfig(True, True, True)
+            if torch.cuda.is_available():
+                self.cuda_config = self._pick_cuda_config()
+        else:
+            print_once("PyTorch < 2.0 detected, flash attention will use einsum fallback.")
+        self._config_checked = True
+    @staticmethod
+    def _pick_cuda_config():
+        """Choose SDPA backends from the CUDA device's compute capability and the OS."""
+        device_properties = torch.cuda.get_device_properties(torch.device("cuda"))
+        device_version = version.parse(
+            f"{device_properties.major}.{device_properties.minor}"
+        )
+        if device_version < version.parse("8.0"):
+            print_once(
+                "GPU Compute Capability below 8.0, using math or mem efficient attention if input tensor is on cuda"
+            )
+            return FlashAttentionConfig(False, True, True)
+        if os.name == "nt":
+            print_once(
+                "Windows OS detected, using math or mem efficient attention if input tensor is on cuda"
+            )
+            return FlashAttentionConfig(False, True, True)
+        print_once(
+            "GPU Compute Capability equal or above 8.0, using flash attention if input tensor is on cuda"
+        )
+        return FlashAttentionConfig(True, False, False)
+    @staticmethod
+    def _sdpa_backend_context(config):
+        """Return a context manager limiting SDPA to the backends enabled in `config`.
+
+        Prefers torch.nn.attention.sdpa_kernel, falls back to
+        torch.backends.cuda.sdp_kernel, and returns a no-op context when
+        neither API is present.
+        """
+        from contextlib import nullcontext
+        attention_api = getattr(torch.nn, "attention", None)
+        if hasattr(attention_api, "sdpa_kernel") and hasattr(attention_api, "SDPBackend"):
+            backend_enum = attention_api.SDPBackend
+            backends = []
+            if config.enable_flash:
+                backends.append(backend_enum.FLASH_ATTENTION)
+            if config.enable_mem_efficient:
+                backends.append(backend_enum.EFFICIENT_ATTENTION)
+            if config.enable_math:
+                backends.append(backend_enum.MATH)
+            return attention_api.sdpa_kernel(backends)
+        legacy_sdp_kernel = getattr(torch.backends.cuda, "sdp_kernel", None)
+        if legacy_sdp_kernel is not None:
+            return legacy_sdp_kernel(**config._asdict())
+        # No kernel-selection API available: let PyTorch pick the backend.
+        return nullcontext()
+    def flash_attn_torch2(self, q, k, v):
+        """Compute attention with F.scaled_dot_product_attention (PyTorch >= 2.0).
+
+        Args:
+            q, k, v: Query, key and value tensors.
+
+        Returns:
+            The attention output tensor produced by SDPA.
+        """
+        if exists(self.scale):
+            # SDPA is called without an explicit scale, so fold the requested
+            # scale into q relative to the q.shape[-1] ** -0.5 default.
+            default_scale = q.shape[-1] ** -0.5
+            q = q * (self.scale / default_scale)
+        config = self.cuda_config if q.is_cuda else self.cpu_config
+        if config is None:
+            # No config was chosen for this device (e.g. CUDA was not visible
+            # when the module was built): allow every backend.
+            config = FlashAttentionConfig(True, True, True)
+        dropout_p = self.dropout if self.training else 0.0
+        with self._sdpa_backend_context(config):
+            return F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p)
+    def forward(self, q, k, v):
+        """Attend `q` over `k`/`v`.
+
+        Args:
+            q, k, v: 4-D tensors; the einsum path indexes them as
+                (b, h, i, d), (b, h, j, d) and (b, h, j, d).
+
+        Returns:
+            Tensor indexed as (b, h, i, d).
+        """
+        if self.flash and self.use_torch_2_sdpa:
+            try:
+                return self.flash_attn_torch2(q, k, v)
+            except Exception as e:
+                # Deliberate best effort: report the failure, then disable the
+                # SDPA path for all later calls and continue with einsum.
+                print(f"Flash attention failed: {e}. Falling back to einsum.")
+                self.use_torch_2_sdpa = False
+        # Explicit path for PyTorch < 2.0, flash disabled, or a failed SDPA call.
+        scale = default(self.scale, q.shape[-1] ** -0.5)
+        sim = einsum("b h i d, b h j d -> b h i j", q, k) * scale
+        attn = sim.softmax(dim=-1)
+        attn = self.attn_dropout(attn)
+        out = einsum("b h i j, b h j d -> b h i d", attn, v)
         return out

models/bs_roformer/bs_conformer.py CHANGED Viewed

@@ -6,10 +6,6 @@ from torch.nn import Module, ModuleList
 import torch.nn.functional as F
 from .attend import Attend
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
@@ -95,7 +91,6 @@ class Attention(Module):
         dropout=0.,
         rotary_embed=None,
         flash=True,
-        sage_attention=False,
     ):
         super().__init__()
         self.heads = heads
@@ -104,10 +99,7 @@ class Attention(Module):
         self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
@@ -151,7 +143,7 @@ class LinearAttention(Module):
         scale=8,
         flash=True,
         dropout=0.,
-        sage_attention=False
     ):
         super().__init__()
         dim_inner = dim_head * heads
@@ -164,10 +156,7 @@ class LinearAttention(Module):
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(scale=scale, dropout=dropout, flash=flash)
-        else:
-            self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
@@ -200,7 +189,6 @@ class Transformer(Module):
         rotary_embed=None,
         flash_attn=True,
         linear_attn=False,
-        sage_attention=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
@@ -213,7 +201,6 @@ class Transformer(Module):
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             else:
                 attn = Attention(
@@ -223,7 +210,6 @@ class Transformer(Module):
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             self.layers.append(ModuleList([
@@ -288,7 +274,6 @@ class ConformerBlock(nn.Module):
         conv_kernel_size=31,
         rotary_embed=None,
         flash_attn=True,
-        sage_attention=False
     ):
         super().__init__()
         self.ff1 = MacaronFF(dim=dim, mult=ff_mult, dropout=ff_dropout)
@@ -299,7 +284,6 @@ class ConformerBlock(nn.Module):
             dropout=attn_dropout,
             rotary_embed=rotary_embed,
             flash=flash_attn,
-            sage_attention=sage_attention
         )
         self.conv = ConformerConvModule(
             dim=dim,
@@ -331,7 +315,6 @@ class Conformer(Module):
         ff_mult=4,
         rotary_embed=None,
         flash_attn=True,
-        sage_attention=False,
         conv_expansion_factor=2,
         conv_kernel_size=31,
         norm_output=True
@@ -349,7 +332,6 @@ class Conformer(Module):
                 conv_kernel_size=conv_kernel_size,
                 rotary_embed=rotary_embed,
                 flash_attn=flash_attn,
-                sage_attention=sage_attention
             ) for _ in range(depth)
         ])
         self.norm = RMSNorm(dim) if norm_output else nn.Identity()
@@ -473,11 +455,11 @@ class BSConformer(Module):
         mlp_expansion_factor = 4,
         use_torch_checkpoint = False,
         skip_connection = False,
-        sage_attention = False,
         # conformer-specific
         ff_mult = 4,
         conv_expansion_factor = 2,
-        conv_kernel_size = 31
     ):
         super().__init__()
         self.stereo = stereo
@@ -488,9 +470,6 @@ class BSConformer(Module):
         self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
         transformer_kwargs = dict(
             dim = dim,
             heads = heads,
@@ -498,7 +477,6 @@ class BSConformer(Module):
             attn_dropout = attn_dropout,
             ff_dropout = ff_dropout,
             flash_attn = flash_attn,
-            sage_attention = sage_attention,
             norm_output = False
         )

 import torch.nn.functional as F
 from .attend import Attend
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
         dropout=0.,
         rotary_embed=None,
         flash=True,
     ):
         super().__init__()
         self.heads = heads
         self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
         scale=8,
         flash=True,
         dropout=0.,
+        **kwargs
     ):
         super().__init__()
         dim_inner = dim_head * heads
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
         rotary_embed=None,
         flash_attn=True,
         linear_attn=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
                 )
             else:
                 attn = Attention(
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
                 )
             self.layers.append(ModuleList([
         conv_kernel_size=31,
         rotary_embed=None,
         flash_attn=True,
     ):
         super().__init__()
         self.ff1 = MacaronFF(dim=dim, mult=ff_mult, dropout=ff_dropout)
             dropout=attn_dropout,
             rotary_embed=rotary_embed,
             flash=flash_attn,
         )
         self.conv = ConformerConvModule(
             dim=dim,
         ff_mult=4,
         rotary_embed=None,
         flash_attn=True,
         conv_expansion_factor=2,
         conv_kernel_size=31,
         norm_output=True
                 conv_kernel_size=conv_kernel_size,
                 rotary_embed=rotary_embed,
                 flash_attn=flash_attn,
             ) for _ in range(depth)
         ])
         self.norm = RMSNorm(dim) if norm_output else nn.Identity()
         mlp_expansion_factor = 4,
         use_torch_checkpoint = False,
         skip_connection = False,
         # conformer-specific
         ff_mult = 4,
         conv_expansion_factor = 2,
+        conv_kernel_size = 31,
+        **kwargs
     ):
         super().__init__()
         self.stereo = stereo
         self.layers = ModuleList([])
         transformer_kwargs = dict(
             dim = dim,
             heads = heads,
             attn_dropout = attn_dropout,
             ff_dropout = ff_dropout,
             flash_attn = flash_attn,
             norm_output = False
         )

models/bs_roformer/bs_roformer.py CHANGED Viewed

@@ -1,767 +1,768 @@
-from functools import partial
-import torch
-from torch import nn, einsum, tensor, Tensor
-from torch.nn import Module, ModuleList
-import torch.nn.functional as F
-from .attend import Attend
-from torch.utils.checkpoint import checkpoint
-from beartype.typing import Tuple, Optional, List, Callable
-from beartype import beartype
-from rotary_embedding_torch import RotaryEmbedding
-from einops import rearrange, pack, unpack
-from einops.layers.torch import Rearrange
-try:
-    from .pope.attention import flash_attn_with_pope
-    from .pope.pope import PoPE
-    _HAS_POPE = True
-except Exception:
-    PoPE = None
-    flash_attn_with_pope = None
-    _HAS_POPE = False
-# helper functions
-def exists(val):
-    return val is not None
-def default(v, d):
-    return v if exists(v) else d
-def pack_one(t, pattern):
-    return pack([t], pattern)
-def unpack_one(t, ps, pattern):
-    return unpack(t, ps, pattern)[0]
-# norm
-def l2norm(t):
-    return F.normalize(t, dim = -1, p = 2)
-class RMSNorm(Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.scale = dim ** 0.5
-        self.gamma = nn.Parameter(torch.ones(dim))
-    def forward(self, x):
-        return F.normalize(x, dim=-1) * self.scale * self.gamma
-# attention
-class FeedForward(Module):
-    def __init__(
-            self,
-            dim,
-            mult=4,
-            dropout=0.
-    ):
-        super().__init__()
-        dim_inner = int(dim * mult)
-        self.net = nn.Sequential(
-            RMSNorm(dim),
-            nn.Linear(dim, dim_inner),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_inner, dim),
-            nn.Dropout(dropout)
-        )
-    def forward(self, x):
-        return self.net(x)
-class Attention(Module):
-    def __init__(
-            self,
-            dim,
-            heads=8,
-            dim_head=64,
-            dropout=0.,
-            rotary_embed=None,
-            flash=True,
-            pope_embed=None,
-            learned_value_residual_mix = False
-    ):
-        super().__init__()
-        self.heads = heads
-        self.scale = dim_head ** -0.5
-        dim_inner = heads * dim_head
-        self.rotary_embed = rotary_embed
-        self.pope_embed = pope_embed
-        assert not (self.rotary_embed is not None and self.pope_embed is not None), \
-            "cannot have both rotary and pope embeddings"
-        self.attend = Attend(flash=flash, dropout=dropout)
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
-        self.to_value_residual_mix = nn.Linear(dim, heads) if learned_value_residual_mix else None
-        self.to_gates = nn.Linear(dim, heads)
-        self.to_out = nn.Sequential(
-            nn.Linear(dim_inner, dim, bias=False),
-            nn.Dropout(dropout)
-        )
-    def forward(self, x, value_residual = None):
-        x = self.norm(x)
-        q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads)
-        orig_v = v
-        if exists(self.pope_embed):
-            out = flash_attn_with_pope(
-                q, k, v,
-                pos_emb=self.pope_embed(q.shape[-2]),
-                softmax_scale=self.scale
-            )
-        elif exists(self.rotary_embed):
-            q = self.rotary_embed.rotate_queries_or_keys(q)
-            k = self.rotary_embed.rotate_queries_or_keys(k)
-            out = self.attend(q, k, v)
-        elif exists(self.to_value_residual_mix):
-            mix = self.to_value_residual_mix(x)
-            mix = rearrange(mix, 'b n h -> b h n 1').sigmoid()
-            assert exists(value_residual)
-            v = v.lerp(value_residual, mix)
-            out = self.attend(q, k, v)
-        else:
-            out = self.attend(q, k, v)
-        gates = self.to_gates(x)
-        out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid()
-        out = rearrange(out, 'b h n d -> b n (h d)')
-        return self.to_out(out), orig_v
-class LinearAttention(Module):
-    """
-    this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al.
-    """
-    @beartype
-    def __init__(
-            self,
-            *,
-            dim,
-            dim_head=32,
-            heads=8,
-            scale=8,
-            flash=False,
-            dropout=0.
-    ):
-        super().__init__()
-        dim_inner = dim_head * heads
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Sequential(
-            nn.Linear(dim, dim_inner * 3, bias=False),
-            Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads)
-        )
-        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        self.attend = Attend(
-            scale=scale,
-            dropout=dropout,
-            flash=flash
-        )
-        self.to_out = nn.Sequential(
-            Rearrange('b h d n -> b n (h d)'),
-            nn.Linear(dim_inner, dim, bias=False)
-        )
-    def forward(
-            self,
-            x
-    ):
-        x = self.norm(x)
-        q, k, v = self.to_qkv(x)
-        q, k = map(l2norm, (q, k))
-        q = q * self.temperature.exp()
-        out = self.attend(q, k, v)
-        return self.to_out(out)
-class Transformer(Module):
-    def __init__(
-            self,
-            *,
-            dim,
-            depth,
-            dim_head=64,
-            heads=8,
-            attn_dropout=0.,
-            ff_dropout=0.,
-            ff_mult=4,
-            norm_output=True,
-            rotary_embed=None,
-            pope_embed=None,
-            flash_attn=True,
-            linear_attn=False,
-            add_value_residual = False
-    ):
-        super().__init__()
-        self.layers = ModuleList([])
-        for _ in range(depth):
-            if linear_attn:
-                attn = LinearAttention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    flash=flash_attn
-                )
-            else:
-                attn = Attention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    rotary_embed=rotary_embed,
-                    pope_embed=pope_embed,
-                    flash=flash_attn,
-                    learned_value_residual_mix=add_value_residual
-                )
-            self.layers.append(ModuleList([
-                attn,
-                FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
-            ]))
-        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
-    def forward(self, x, value_residual=None):
-        first_values = None
-        for attn, ff in self.layers:
-            attn_out, next_values = attn(x, value_residual=value_residual)
-            if first_values is None:
-                first_values = next_values
-            x = attn_out + x
-            x = ff(x) + x
-        return self.norm(x), first_values
-# bandsplit module
-class BandSplit(Module):
-    @beartype
-    def __init__(
-            self,
-            dim,
-            dim_inputs: Tuple[int, ...]
-    ):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_features = ModuleList([])
-        for dim_in in dim_inputs:
-            net = nn.Sequential(
-                RMSNorm(dim_in),
-                nn.Linear(dim_in, dim)
-            )
-            self.to_features.append(net)
-    def forward(self, x):
-        x = x.split(self.dim_inputs, dim=-1)
-        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
-            split_output = to_feature(split_input)
-            outs.append(split_output)
-        return torch.stack(outs, dim=-2)
-def MLP(
-        dim_in,
-        dim_out,
-        dim_hidden=None,
-        depth=1,
-        activation=nn.Tanh
-):
-    dim_hidden = default(dim_hidden, dim_in)
-    net = []
-    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-        is_last = ind == (len(dims) - 2)
-        net.append(nn.Linear(layer_dim_in, layer_dim_out))
-        if is_last:
-            continue
-        net.append(activation())
-    return nn.Sequential(*net)
-class MaskEstimator(Module):
-    @beartype
-    def __init__(
-            self,
-            dim,
-            dim_inputs: Tuple[int, ...],
-            depth,
-            mlp_expansion_factor=4
-    ):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_freqs = ModuleList([])
-        dim_hidden = dim * mlp_expansion_factor
-        for dim_in in dim_inputs:
-            net = []
-            mlp = nn.Sequential(
-                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth),
-                nn.GLU(dim=-1)
-            )
-            self.to_freqs.append(mlp)
-    def forward(self, x):
-        x = x.unbind(dim=-2)
-        outs = []
-        for band_features, mlp in zip(x, self.to_freqs):
-            freq_out = mlp(band_features)
-            outs.append(freq_out)
-        return torch.cat(outs, dim=-1)
-# main class
-DEFAULT_FREQS_PER_BANDS = (
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    2, 2, 2, 2,
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-    12, 12, 12, 12, 12, 12, 12, 12,
-    24, 24, 24, 24, 24, 24, 24, 24,
-    48, 48, 48, 48, 48, 48, 48, 48,
-    128, 129,
-)
-class BSRoformer(Module):
-    @beartype
-    def __init__(
-            self,
-            dim,
-            *,
-            depth,
-            stereo=False,
-            num_stems=1,
-            time_transformer_depth=2,
-            freq_transformer_depth=2,
-            linear_transformer_depth=0,
-            freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
-            # in the paper, they divide into ~60 bands, test with 1 for starters
-            dim_head=64,
-            heads=8,
-            attn_dropout=0.,
-            ff_dropout=0.,
-            flash_attn=True,
-            dim_freqs_in=1025,
-            stft_n_fft=2048,
-            stft_hop_length=512,
-            # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
-            stft_win_length=2048,
-            stft_normalized=False,
-            stft_window_fn: Optional[Callable] = None,
-            zero_dc = True,
-            mask_estimator_depth=2,
-            multi_stft_resolution_loss_weight=1.,
-            multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
-            multi_stft_hop_size=147,
-            multi_stft_normalized=False,
-            multi_stft_window_fn: Callable = torch.hann_window,
-            mlp_expansion_factor=4,
-            use_torch_checkpoint=False,
-            skip_connection=False,
-            use_pope: bool = False,
-            residual_value: bool = False
-    ):
-        super().__init__()
-        self.stereo = stereo
-        self.audio_channels = 2 if stereo else 1
-        self.num_stems = num_stems
-        self.use_torch_checkpoint = use_torch_checkpoint
-        self.skip_connection = skip_connection
-        self.layers = ModuleList([])
-        transformer_kwargs = dict(
-            dim=dim,
-            heads=heads,
-            dim_head=dim_head,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            flash_attn=flash_attn,
-            norm_output=False,
-        )
-        if use_pope:
-            time_pope_embed = PoPE(dim=dim_head, heads=heads)
-            freq_pope_embed = PoPE(dim=dim_head, heads=heads)
-            time_rotary_embed = None
-            freq_rotary_embed = None
-        else:
-            time_rotary_embed = RotaryEmbedding(dim = dim_head)
-            freq_rotary_embed = RotaryEmbedding(dim = dim_head)
-            time_pope_embed = freq_pope_embed = None
-        if residual_value:
-            for layer_index in range(depth):
-                tran_modules = []
-                is_first = layer_index == 0
-                if linear_transformer_depth > 0:
-                    tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, add_value_residual=not is_first, **transformer_kwargs))
-                tran_modules.append(
-                    Transformer(
-                        depth=time_transformer_depth,
-                        rotary_embed=time_rotary_embed,
-                        pope_embed=time_pope_embed,
-                        add_value_residual=not is_first,
-                        **transformer_kwargs
-                    )
-                )
-                tran_modules.append(
-                    Transformer(
-                        depth=freq_transformer_depth,
-                        rotary_embed=freq_rotary_embed,
-                        pope_embed=freq_pope_embed,
-                        add_value_residual=not is_first,
-                        **transformer_kwargs
-                    )
-                )
-                self.layers.append(nn.ModuleList(tran_modules))
-        else:
-            for layer_index in range(depth):
-                tran_modules = []
-                if linear_transformer_depth > 0:
-                    tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, add_value_residual=not is_first, **transformer_kwargs))
-                tran_modules.append(
-                    Transformer(
-                        depth=time_transformer_depth,
-                        rotary_embed=time_rotary_embed,
-                        pope_embed=time_pope_embed,
-                        add_value_residual=False,
-                        **transformer_kwargs
-                    )
-                )
-                tran_modules.append(
-                    Transformer(
-                        depth=freq_transformer_depth,
-                        rotary_embed=freq_rotary_embed,
-                        pope_embed=freq_pope_embed,
-                        add_value_residual=False,
-                        **transformer_kwargs
-                    )
-                )
-                self.layers.append(nn.ModuleList(tran_modules))
-        self.final_norm = RMSNorm(dim)
-        self.stft_kwargs = dict(
-            n_fft=stft_n_fft,
-            hop_length=stft_hop_length,
-            win_length=stft_win_length,
-            normalized=stft_normalized
-        )
-        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)
-        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_win_length), return_complex=True).shape[1]
-        assert len(freqs_per_bands) > 1
-        assert sum(
-            freqs_per_bands) == freqs, f'the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}'
-        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands)
-        self.band_split = BandSplit(
-            dim=dim,
-            dim_inputs=freqs_per_bands_with_complex
-        )
-        self.mask_estimators = nn.ModuleList([])
-        for _ in range(num_stems):
-            mask_estimator = MaskEstimator(
-                dim=dim,
-                dim_inputs=freqs_per_bands_with_complex,
-                depth=mask_estimator_depth,
-                mlp_expansion_factor=mlp_expansion_factor,
-            )
-            self.mask_estimators.append(mask_estimator)
-        # whether to zero out dc
-        self.zero_dc = zero_dc
-        # for the multi-resolution stft loss
-        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
-        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
-        self.multi_stft_n_fft = stft_n_fft
-        self.multi_stft_window_fn = multi_stft_window_fn
-        self.multi_stft_kwargs = dict(
-            hop_length=multi_stft_hop_size,
-            normalized=multi_stft_normalized
-        )
-    def forward(
-            self,
-            raw_audio,
-            target=None,
-            active_stem_ids=None,
-            return_loss_breakdown=False
-    ):
-        """
-        einops
-        b - batch
-        f - freq
-        t - time
-        s - audio channel (1 for mono, 2 for stereo)
-        n - number of 'stems'
-        c - complex (2)
-        d - feature dimension
-        """
-        device = raw_audio.device
-        x_is_mps = True if device.type == "mps" else False
-        if raw_audio.ndim == 2:
-            raw_audio = rearrange(raw_audio, 'b t -> b 1 t')
-        channels = raw_audio.shape[1]
-        assert (not self.stereo and channels == 1) or (
-                    self.stereo and channels == 2),\
-            ('stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2).'
-            ' also need to be False if mono (channel dimension of 1)')
-        # to stft
-        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t')
-        stft_window = self.stft_window_fn(device=device)
-        try:
-            stft_repr = torch.stft(
-                raw_audio,
-                **self.stft_kwargs,
-                window=stft_window,
-                return_complex=True
-            )
-        except:
-            stft_repr = torch.stft(
-                raw_audio.cpu() if x_is_mps else raw_audio,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=True
-            ).to(device)
-        stft_repr = torch.view_as_real(stft_repr)
-        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c')
-        # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
-        stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c')
-        x = rearrange(stft_repr, 'b f t c -> b t (f c)')
-        if self.use_torch_checkpoint:
-            x = checkpoint(self.band_split, x, use_reentrant=False)
-        else:
-            x = self.band_split(x)
-        # axial / hierarchical attention
-        store = [None] * len(self.layers)
-        # Initialize value residuals if residual_value is enabled
-        time_v_residual = None
-        freq_v_residual = None
-        for i, transformer_block in enumerate(self.layers):
-            if len(transformer_block) == 3:
-                linear_transformer, time_transformer, freq_transformer = transformer_block
-                x, ft_ps = pack([x], 'b * d')
-                if self.use_torch_checkpoint:
-                    linear_out, _ = checkpoint(linear_transformer, x, use_reentrant=False)
-                else:
-                    linear_out, _ = linear_transformer(x)
-                x, = unpack(linear_out, ft_ps, 'b * d')
-            else:
-                time_transformer, freq_transformer = transformer_block
-            if self.skip_connection:
-                # Sum all previous
-                for j in range(i):
-                    x = x + store[j]
-            # Time transformer
-            x = rearrange(x, 'b t f d -> b f t d')
-            x, ps = pack([x], '* t d')
-            if self.use_torch_checkpoint:
-                time_out, next_time_v_residual = checkpoint(time_transformer, x, use_reentrant=False)
-            else:
-                time_out, next_time_v_residual = time_transformer(x, value_residual=time_v_residual)
-            if time_v_residual is None:
-                time_v_residual = next_time_v_residual
-            x = time_out
-            x, = unpack(x, ps, '* t d')
-            x = rearrange(x, 'b f t d -> b t f d')
-            # Frequency transformer
-            x, ps = pack([x], '* f d')
-            if self.use_torch_checkpoint:
-                freq_out, next_freq_v_residual = checkpoint(freq_transformer, x, use_reentrant=False)
-            else:
-                freq_out, next_freq_v_residual = freq_transformer(x, value_residual=freq_v_residual)
-            if freq_v_residual is None:
-                freq_v_residual = next_freq_v_residual
-            x = freq_out
-            x, = unpack(x, ps, '* f d')
-            if self.skip_connection:
-                store[i] = x
-        x = self.final_norm(x)
-        if active_stem_ids is None:
-            heads = self.mask_estimators
-            stem_ids = list(range(len(self.mask_estimators)))
-        else:
-            heads = [self.mask_estimators[i] for i in active_stem_ids]
-            stem_ids = active_stem_ids
-        num_stems = len(heads)
-        if self.use_torch_checkpoint:
-            mask = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in heads], dim=1)
-        else:
-            mask = torch.stack([fn(x) for fn in heads], dim=1)
-        mask = rearrange(mask, 'b n t (f c) -> b n f t c', c=2)
-        # modulate frequency representation
-        stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c')
-        # complex number multiplication
-        stft_repr = torch.view_as_complex(stft_repr)
-        mask = torch.view_as_complex(mask)
-        stft_repr = stft_repr * mask
-        # istft
-        stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels)
-        if self.zero_dc:
-            # whether to dc filter
-            stft_repr = stft_repr.index_fill(1, tensor(0, device = device), 0.)
-        try:
-            recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=raw_audio.shape[-1])
-        except:
-            recon_audio = torch.istft(
-                stft_repr.cpu() if x_is_mps else stft_repr,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1]
-            ).to(device)
-        recon_audio = rearrange(
-            recon_audio,
-            '(b n s) t -> b n s t',
-            s=self.audio_channels,
-            n=num_stems
-        )
-        if not exists(target):
-            return recon_audio
-        if target.ndim == 2:
-            target = rearrange(target, '... t -> ... 1 t')
-        target = target[..., :recon_audio.shape[-1]]  # protect against lost length on istft
-        target_sel = target[:, stem_ids]
-        loss = F.l1_loss(recon_audio, target_sel)
-        multi_stft_resolution_loss = 0.
-        for window_size in self.multi_stft_resolutions_window_sizes:
-            res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft),  # not sure what n_fft is across multi resolution stft
-                win_length=window_size,
-                return_complex=True,
-                window=self.multi_stft_window_fn(window_size, device=device),
-                **self.multi_stft_kwargs,
-            )
-            recon_Y = torch.stft(rearrange(recon_audio, 'b n s t -> (b n s) t'),**res_stft_kwargs)
-            target_Y = torch.stft(rearrange(target_sel, 'b n s t -> (b n s) t'),**res_stft_kwargs)
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)
-        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
-        total_loss = loss + weighted_multi_resolution_loss
-        if not return_loss_breakdown:
-            return total_loss
         return total_loss, (loss, multi_stft_resolution_loss)

+from functools import partial
+import torch
+from torch import nn, einsum, tensor, Tensor
+from torch.nn import Module, ModuleList
+import torch.nn.functional as F
+from .attend import Attend
+from torch.utils.checkpoint import checkpoint
+from beartype.typing import Tuple, Optional, List, Callable
+from beartype import beartype
+from rotary_embedding_torch import RotaryEmbedding
+from einops import rearrange, pack, unpack
+from einops.layers.torch import Rearrange
+try:
+    from .pope.attention import flash_attn_with_pope
+    from .pope.pope import PoPE
+    _HAS_POPE = True
+except Exception:
+    PoPE = None
+    flash_attn_with_pope = None
+    _HAS_POPE = False
+# helper functions
+def exists(val):
+    """Return True for any value other than None (falsy values such as 0 or '' count as existing)."""
+    return not (val is None)
+def default(v, d):
+    """Return `v` unless it is None, in which case return the fallback `d`."""
+    if exists(v):
+        return v
+    return d
+def pack_one(t, pattern):
+    """Pack the single tensor `t` with einops `pack`, returning exactly what `pack` returns for a one-element list."""
+    singleton = [t]
+    return pack(singleton, pattern)
+def unpack_one(t, ps, pattern):
+    """Inverse of `pack_one`: run einops `unpack` and return its first element."""
+    restored = unpack(t, ps, pattern)
+    return restored[0]
+# norm
+def l2norm(t):
+    """Scale `t` to unit L2 norm along its last dimension."""
+    return F.normalize(t, p=2, dim=-1)
+class RMSNorm(Module):
+    """Normalize the last dimension to unit L2 norm, then rescale by sqrt(dim) and a learned per-feature gain."""
+    def __init__(self, dim):
+        super().__init__()
+        # constant factor applied after the unit-norm projection
+        self.scale = dim ** 0.5
+        # learned gain, one entry per feature, starting at 1
+        self.gamma = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        unit = F.normalize(x, dim=-1)
+        return unit * self.scale * self.gamma
+# attention
+class FeedForward(Module):
+    """Pre-normalized MLP: RMSNorm -> Linear -> GELU -> Dropout -> Linear -> Dropout."""
+    def __init__(
+            self,
+            dim,
+            mult=4,
+            dropout=0.
+    ):
+        super().__init__()
+        # width of the hidden layer
+        hidden_dim = int(dim * mult)
+        stages = [
+            RMSNorm(dim),
+            nn.Linear(dim, hidden_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden_dim, dim),
+            nn.Dropout(dropout),
+        ]
+        self.net = nn.Sequential(*stages)
+    def forward(self, x):
+        """Apply the stack to `x`; the caller is responsible for the residual connection."""
+        return self.net(x)
+class Attention(Module):
+    """
+    Gated multi-head self-attention with pre-normalization.
+    Queries and keys may be position-encoded with either a rotary embedding or a PoPE
+    embedding (never both). When `learned_value_residual_mix` is set, the layer's values are
+    blended with a `value_residual` supplied by the caller, using a learned per-head,
+    per-position sigmoid weight.
+    """
+    def __init__(
+            self,
+            dim,
+            heads=8,
+            dim_head=64,
+            dropout=0.,
+            rotary_embed=None,
+            flash=True,
+            pope_embed=None,
+            learned_value_residual_mix = False
+    ):
+        super().__init__()
+        self.heads = heads
+        self.scale = dim_head ** -0.5
+        dim_inner = heads * dim_head
+        self.rotary_embed = rotary_embed
+        self.pope_embed = pope_embed
+        assert not (self.rotary_embed is not None and self.pope_embed is not None), \
+            "cannot have both rotary and pope embeddings"
+        self.attend = Attend(flash=flash, dropout=dropout)
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
+        # projects each token to one mixing logit per head; absent when mixing is disabled
+        self.to_value_residual_mix = nn.Linear(dim, heads) if learned_value_residual_mix else None
+        # per-head output gate
+        self.to_gates = nn.Linear(dim, heads)
+        self.to_out = nn.Sequential(
+            nn.Linear(dim_inner, dim, bias=False),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x, value_residual = None):
+        """
+        Args:
+            x: input of shape (b, n, dim).
+            value_residual: values to blend in, same shape as this layer's values (b, h, n, d);
+                required when the layer was built with `learned_value_residual_mix=True`.
+        Returns:
+            Tuple of (attention output of shape (b, n, dim), this layer's values before mixing).
+        """
+        x = self.norm(x)
+        q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads)
+        # the unmixed values are returned so the caller can feed them to later layers
+        orig_v = v
+        # Mix the values BEFORE attention and independently of the position embedding.
+        # Previously this lived in an `elif` after the pope/rotary branches, so the learned
+        # mix was never applied whenever a position embedding was configured.
+        if exists(self.to_value_residual_mix):
+            assert exists(value_residual), \
+                'value_residual must be provided when learned_value_residual_mix is enabled'
+            mix = self.to_value_residual_mix(x)
+            mix = rearrange(mix, 'b n h -> b h n 1').sigmoid()
+            v = v.lerp(value_residual, mix)
+        if exists(self.pope_embed):
+            out = flash_attn_with_pope(
+                q, k, v,
+                pos_emb=self.pope_embed(q.shape[-2]),
+                softmax_scale=self.scale
+            )
+        else:
+            if exists(self.rotary_embed):
+                q = self.rotary_embed.rotate_queries_or_keys(q)
+                k = self.rotary_embed.rotate_queries_or_keys(k)
+            out = self.attend(q, k, v)
+        gates = self.to_gates(x)
+        out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid()
+        out = rearrange(out, 'b h n d -> b n (h d)')
+        return self.to_out(out), orig_v
+class LinearAttention(Module):
+    """
+    this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al.
+    Queries and keys are laid out as (b, h, d, n) and L2-normalized along the last axis before
+    attention, with a learned per-head temperature applied to the queries.
+    """
+    @beartype
+    def __init__(
+            self,
+            *,
+            dim,
+            dim_head=32,
+            heads=8,
+            scale=8,
+            flash=False,
+            dropout=0.
+    ):
+        super().__init__()
+        inner_dim = dim_head * heads
+        self.norm = RMSNorm(dim)
+        # project to q, k, v and move the sequence axis last
+        qkv_proj = nn.Linear(dim, inner_dim * 3, bias=False)
+        qkv_layout = Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads)
+        self.to_qkv = nn.Sequential(qkv_proj, qkv_layout)
+        # log-temperature, one scalar per head (exponentiated in forward)
+        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(
+            scale=scale,
+            dropout=dropout,
+            flash=flash
+        )
+        # restore (b, n, h*d) and project back to the model width
+        merge_heads = Rearrange('b h d n -> b n (h d)')
+        out_proj = nn.Linear(inner_dim, dim, bias=False)
+        self.to_out = nn.Sequential(merge_heads, out_proj)
+    def forward(
+            self,
+            x
+    ):
+        """Return the attention output for `x` of shape (b, n, dim); a single tensor, not a tuple."""
+        normed = self.norm(x)
+        q, k, v = self.to_qkv(normed)
+        q = l2norm(q)
+        k = l2norm(k)
+        q = q * self.temperature.exp()
+        attended = self.attend(q, k, v)
+        return self.to_out(attended)
+class Transformer(Module):
+    """
+    Stack of `depth` (attention, feed-forward) pairs with residual connections.
+    The attention is either `LinearAttention` (`linear_attn=True`) or `Attention`; only the
+    latter uses `rotary_embed`, `pope_embed` and `add_value_residual`.
+    """
+    def __init__(
+            self,
+            *,
+            dim,
+            depth,
+            dim_head=64,
+            heads=8,
+            attn_dropout=0.,
+            ff_dropout=0.,
+            ff_mult=4,
+            norm_output=True,
+            rotary_embed=None,
+            pope_embed=None,
+            flash_attn=True,
+            linear_attn=False,
+            add_value_residual = False
+    ):
+        super().__init__()
+        self.layers = ModuleList([])
+        for _ in range(depth):
+            if linear_attn:
+                attn = LinearAttention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    flash=flash_attn
+                )
+            else:
+                attn = Attention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    rotary_embed=rotary_embed,
+                    pope_embed=pope_embed,
+                    flash=flash_attn,
+                    learned_value_residual_mix=add_value_residual
+                )
+            self.layers.append(ModuleList([
+                attn,
+                FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
+            ]))
+        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
+    def forward(self, x, value_residual=None):
+        """
+        Args:
+            x: input of shape (b, n, dim).
+            value_residual: optional values forwarded to every `Attention` layer.
+        Returns:
+            Tuple of (output, values of the first `Attention` layer). The second element is
+            None when the stack is built from `LinearAttention`, which exposes no values.
+        """
+        first_values = None
+        for attn, ff in self.layers:
+            if isinstance(attn, LinearAttention):
+                # LinearAttention.forward takes only `x` and returns a single tensor, so it
+                # cannot be called with `value_residual` or unpacked into two results.
+                attn_out, next_values = attn(x), None
+            else:
+                attn_out, next_values = attn(x, value_residual=value_residual)
+            if first_values is None:
+                first_values = next_values
+            x = attn_out + x
+            x = ff(x) + x
+        return self.norm(x), first_values
+# bandsplit module
+class BandSplit(Module):
+    """Split the last dimension into bands of widths `dim_inputs` and project each band to `dim` features."""
+    @beartype
+    def __init__(
+            self,
+            dim,
+            dim_inputs: Tuple[int, ...]
+    ):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_features = ModuleList([])
+        # one RMSNorm + Linear projection per band, in band order
+        for band_width in dim_inputs:
+            self.to_features.append(
+                nn.Sequential(
+                    RMSNorm(band_width),
+                    nn.Linear(band_width, dim)
+                )
+            )
+    def forward(self, x):
+        """Return the per-band features stacked on a new second-to-last axis."""
+        bands = x.split(self.dim_inputs, dim=-1)
+        projected = [proj(band) for band, proj in zip(bands, self.to_features)]
+        return torch.stack(projected, dim=-2)
+def MLP(
+        dim_in,
+        dim_out,
+        dim_hidden=None,
+        depth=1,
+        activation=nn.Tanh
+):
+    """
+    Build an `nn.Sequential` of `depth` Linear layers with `activation` between them.
+    Hidden layers have width `dim_hidden` (defaults to `dim_in`); no activation follows the
+    final Linear.
+    """
+    hidden = default(dim_hidden, dim_in)
+    widths = (dim_in,) + (hidden,) * (depth - 1) + (dim_out,)
+    last_index = len(widths) - 2
+    layers = []
+    for index, (width_in, width_out) in enumerate(zip(widths, widths[1:])):
+        layers.append(nn.Linear(width_in, width_out))
+        if index != last_index:
+            layers.append(activation())
+    return nn.Sequential(*layers)
+class MaskEstimator(Module):
+    """
+    Map per-band features back to per-band frequency values.
+    Each band gets its own MLP producing `2 * dim_in` outputs, halved again by a GLU, so the
+    band contributes `dim_in` values; band outputs are concatenated on the last axis.
+    """
+    @beartype
+    def __init__(
+            self,
+            dim,
+            dim_inputs: Tuple[int, ...],
+            depth,
+            mlp_expansion_factor=4
+    ):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_freqs = ModuleList([])
+        dim_hidden = dim * mlp_expansion_factor
+        for dim_in in dim_inputs:
+            # the MLP emits twice the band width because GLU gates one half with the other
+            mlp = nn.Sequential(
+                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth),
+                nn.GLU(dim=-1)
+            )
+            self.to_freqs.append(mlp)
+    def forward(self, x):
+        """Take features with bands on the second-to-last axis and return the concatenated band outputs."""
+        per_band = x.unbind(dim=-2)
+        outs = [mlp(band_features) for band_features, mlp in zip(per_band, self.to_freqs)]
+        return torch.cat(outs, dim=-1)
+# main class
+# Default band layout: 62 band widths (in frequency bins) that sum to 1025.
+# BSRoformer asserts that the widths sum to the number of STFT frequency bins, so a custom
+# STFT size needs a matching custom layout.
+DEFAULT_FREQS_PER_BANDS = (
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    2, 2, 2, 2,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    12, 12, 12, 12, 12, 12, 12, 12,
+    24, 24, 24, 24, 24, 24, 24, 24,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    128, 129,
+)
+class BSRoformer(Module):
+    """
+    Band-split transformer for audio source separation.
+    The waveform is converted to an STFT, split into frequency bands, and passed through
+    `depth` blocks of (optional linear attention,) time attention and frequency attention.
+    One `MaskEstimator` per stem predicts a complex mask that is multiplied onto the STFT
+    before the inverse STFT. With a `target`, `forward` returns an L1 + multi-resolution STFT
+    loss instead of audio.
+    """
+    @beartype
+    def __init__(
+            self,
+            dim,
+            *,
+            depth,
+            stereo=False,
+            num_stems=1,
+            time_transformer_depth=2,
+            freq_transformer_depth=2,
+            linear_transformer_depth=0,
+            freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
+            # in the paper, they divide into ~60 bands, test with 1 for starters
+            dim_head=64,
+            heads=8,
+            attn_dropout=0.,
+            ff_dropout=0.,
+            flash_attn=True,
+            dim_freqs_in=1025,
+            stft_n_fft=2048,
+            stft_hop_length=512,
+            # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
+            stft_win_length=2048,
+            stft_normalized=False,
+            stft_window_fn: Optional[Callable] = None,
+            zero_dc = True,
+            mask_estimator_depth=2,
+            multi_stft_resolution_loss_weight=1.,
+            multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
+            multi_stft_hop_size=147,
+            multi_stft_normalized=False,
+            multi_stft_window_fn: Callable = torch.hann_window,
+            mlp_expansion_factor=4,
+            use_torch_checkpoint=False,
+            skip_connection=False,
+            use_pope: bool = False,
+            residual_value: bool = False,
+            **kwargs
+    ):
+        # NOTE: `dim_freqs_in` and any extra `**kwargs` are accepted but not read here.
+        super().__init__()
+        self.stereo = stereo
+        self.audio_channels = 2 if stereo else 1
+        self.num_stems = num_stems
+        self.use_torch_checkpoint = use_torch_checkpoint
+        self.skip_connection = skip_connection
+        self.layers = ModuleList([])
+        transformer_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
+            norm_output=False,
+        )
+        if use_pope:
+            # PoPE is an optional import; fail with a clear message instead of "NoneType is not callable"
+            if not _HAS_POPE:
+                raise ImportError('use_pope=True requires the optional `.pope` package, which could not be imported')
+            time_pope_embed = PoPE(dim=dim_head, heads=heads)
+            freq_pope_embed = PoPE(dim=dim_head, heads=heads)
+            time_rotary_embed = None
+            freq_rotary_embed = None
+        else:
+            time_rotary_embed = RotaryEmbedding(dim = dim_head)
+            freq_rotary_embed = RotaryEmbedding(dim = dim_head)
+            time_pope_embed = freq_pope_embed = None
+        for layer_index in range(depth):
+            # With `residual_value`, every block after the first learns to mix in the values
+            # produced by the first block. Computing the flag once per block also removes the
+            # NameError the `residual_value=False` path hit when a linear transformer was requested.
+            add_value_residual = residual_value and layer_index > 0
+            tran_modules = []
+            if linear_transformer_depth > 0:
+                tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, add_value_residual=add_value_residual, **transformer_kwargs))
+            tran_modules.append(
+                Transformer(
+                    depth=time_transformer_depth,
+                    rotary_embed=time_rotary_embed,
+                    pope_embed=time_pope_embed,
+                    add_value_residual=add_value_residual,
+                    **transformer_kwargs
+                )
+            )
+            tran_modules.append(
+                Transformer(
+                    depth=freq_transformer_depth,
+                    rotary_embed=freq_rotary_embed,
+                    pope_embed=freq_pope_embed,
+                    add_value_residual=add_value_residual,
+                    **transformer_kwargs
+                )
+            )
+            self.layers.append(nn.ModuleList(tran_modules))
+        self.final_norm = RMSNorm(dim)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized
+        )
+        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)
+        # run a dummy STFT to learn how many frequency bins these settings produce
+        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_win_length), return_complex=True).shape[1]
+        assert len(freqs_per_bands) > 1
+        assert sum(
+            freqs_per_bands) == freqs, f'the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}'
+        # each bin contributes real + imaginary parts for every audio channel
+        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in freqs_per_bands)
+        self.band_split = BandSplit(
+            dim=dim,
+            dim_inputs=freqs_per_bands_with_complex
+        )
+        self.mask_estimators = nn.ModuleList([])
+        for _ in range(num_stems):
+            mask_estimator = MaskEstimator(
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
+                depth=mask_estimator_depth,
+                mlp_expansion_factor=mlp_expansion_factor,
+            )
+            self.mask_estimators.append(mask_estimator)
+        # whether to zero out dc
+        self.zero_dc = zero_dc
+        # for the multi-resolution stft loss
+        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
+        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
+        self.multi_stft_n_fft = stft_n_fft
+        self.multi_stft_window_fn = multi_stft_window_fn
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size,
+            normalized=multi_stft_normalized
+        )
+    def forward(
+            self,
+            raw_audio,
+            target=None,
+            active_stem_ids=None,
+            return_loss_breakdown=False
+    ):
+        """
+        Separate `raw_audio` into stems, or compute the training loss when `target` is given.
+        einops
+        b - batch
+        f - freq
+        t - time
+        s - audio channel (1 for mono, 2 for stereo)
+        n - number of 'stems'
+        c - complex (2)
+        d - feature dimension
+        Args:
+            raw_audio: waveform shaped (b, t) or (b, s, t); `s` must agree with `self.stereo`.
+            target: optional reference audio, indexed as `target[:, stem_ids]`; when given, a
+                loss is returned instead of audio.
+            active_stem_ids: optional indices into `self.mask_estimators` selecting the stems to run.
+            return_loss_breakdown: also return the individual loss terms.
+        Returns:
+            Audio shaped (b, n, s, t) when `target` is None; otherwise the total loss,
+            optionally followed by `(loss, multi_stft_resolution_loss)`.
+        """
+        device = raw_audio.device
+        x_is_mps = device.type == "mps"
+        if raw_audio.ndim == 2:
+            raw_audio = rearrange(raw_audio, 'b t -> b 1 t')
+        channels = raw_audio.shape[1]
+        assert (not self.stereo and channels == 1) or (
+                    self.stereo and channels == 2),\
+            ('stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2).'
+            ' also need to be False if mono (channel dimension of 1)')
+        # to stft
+        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t')
+        stft_window = self.stft_window_fn(device=device)
+        try:
+            stft_repr = torch.stft(
+                raw_audio,
+                **self.stft_kwargs,
+                window=stft_window,
+                return_complex=True
+            )
+        except Exception:
+            # Only MPS has a CPU fallback; on any other device the retry would be the
+            # identical call, so surface the original error instead.
+            if not x_is_mps:
+                raise
+            stft_repr = torch.stft(
+                raw_audio.cpu(),
+                **self.stft_kwargs,
+                window=stft_window.cpu(),
+                return_complex=True
+            ).to(device)
+        stft_repr = torch.view_as_real(stft_repr)
+        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c')
+        # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
+        stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c')
+        x = rearrange(stft_repr, 'b f t c -> b t (f c)')
+        if self.use_torch_checkpoint:
+            x = checkpoint(self.band_split, x, use_reentrant=False)
+        else:
+            x = self.band_split(x)
+        # axial / hierarchical attention
+        store = [None] * len(self.layers)
+        # values of the first time / frequency transformer, reused by later blocks when
+        # value-residual mixing is enabled
+        time_v_residual = None
+        freq_v_residual = None
+        for i, transformer_block in enumerate(self.layers):
+            if len(transformer_block) == 3:
+                linear_transformer, time_transformer, freq_transformer = transformer_block
+                x, ft_ps = pack([x], 'b * d')
+                if self.use_torch_checkpoint:
+                    linear_out, _ = checkpoint(linear_transformer, x, use_reentrant=False)
+                else:
+                    linear_out, _ = linear_transformer(x)
+                x, = unpack(linear_out, ft_ps, 'b * d')
+            else:
+                time_transformer, freq_transformer = transformer_block
+            if self.skip_connection:
+                # Sum all previous
+                for j in range(i):
+                    x = x + store[j]
+            # Time transformer
+            x = rearrange(x, 'b t f d -> b f t d')
+            x, ps = pack([x], '* t d')
+            if self.use_torch_checkpoint:
+                # pass the value residual positionally so the checkpointed path matches the plain one
+                time_out, next_time_v_residual = checkpoint(time_transformer, x, time_v_residual, use_reentrant=False)
+            else:
+                time_out, next_time_v_residual = time_transformer(x, value_residual=time_v_residual)
+            if time_v_residual is None:
+                time_v_residual = next_time_v_residual
+            x = time_out
+            x, = unpack(x, ps, '* t d')
+            x = rearrange(x, 'b f t d -> b t f d')
+            # Frequency transformer
+            x, ps = pack([x], '* f d')
+            if self.use_torch_checkpoint:
+                freq_out, next_freq_v_residual = checkpoint(freq_transformer, x, freq_v_residual, use_reentrant=False)
+            else:
+                freq_out, next_freq_v_residual = freq_transformer(x, value_residual=freq_v_residual)
+            if freq_v_residual is None:
+                freq_v_residual = next_freq_v_residual
+            x = freq_out
+            x, = unpack(x, ps, '* f d')
+            if self.skip_connection:
+                store[i] = x
+        x = self.final_norm(x)
+        if active_stem_ids is None:
+            heads = self.mask_estimators
+            stem_ids = list(range(len(self.mask_estimators)))
+        else:
+            heads = [self.mask_estimators[i] for i in active_stem_ids]
+            stem_ids = active_stem_ids
+        num_stems = len(heads)
+        if self.use_torch_checkpoint:
+            mask = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in heads], dim=1)
+        else:
+            mask = torch.stack([fn(x) for fn in heads], dim=1)
+        mask = rearrange(mask, 'b n t (f c) -> b n f t c', c=2)
+        # modulate frequency representation
+        stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c')
+        # complex number multiplication
+        stft_repr = torch.view_as_complex(stft_repr)
+        mask = torch.view_as_complex(mask)
+        stft_repr = stft_repr * mask
+        # istft
+        stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels)
+        if self.zero_dc:
+            # whether to dc filter
+            stft_repr = stft_repr.index_fill(1, tensor(0, device = device), 0.)
+        try:
+            recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False, length=raw_audio.shape[-1])
+        except Exception:
+            # same MPS-only CPU fallback as for the forward STFT
+            if not x_is_mps:
+                raise
+            recon_audio = torch.istft(
+                stft_repr.cpu(),
+                **self.stft_kwargs,
+                window=stft_window.cpu(),
+                return_complex=False,
+                length=raw_audio.shape[-1]
+            ).to(device)
+        recon_audio = rearrange(
+            recon_audio,
+            '(b n s) t -> b n s t',
+            s=self.audio_channels,
+            n=num_stems
+        )
+        if not exists(target):
+            return recon_audio
+        if target.ndim == 2:
+            target = rearrange(target, '... t -> ... 1 t')
+        target = target[..., :recon_audio.shape[-1]]  # protect against lost length on istft
+        target_sel = target[:, stem_ids]
+        loss = F.l1_loss(recon_audio, target_sel)
+        multi_stft_resolution_loss = 0.
+        for window_size in self.multi_stft_resolutions_window_sizes:
+            res_stft_kwargs = dict(
+                n_fft=max(window_size, self.multi_stft_n_fft),  # not sure what n_fft is across multi resolution stft
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
+            )
+            recon_Y = torch.stft(rearrange(recon_audio, 'b n s t -> (b n s) t'),**res_stft_kwargs)
+            target_Y = torch.stft(rearrange(target_sel, 'b n s t -> (b n s) t'),**res_stft_kwargs)
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)
+        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        total_loss = loss + weighted_multi_resolution_loss
+        if not return_loss_breakdown:
+            return total_loss
         return total_loss, (loss, multi_stft_resolution_loss)

models/bs_roformer/bs_roformer_conditional.py CHANGED Viewed

@@ -7,10 +7,7 @@ import torch.nn.functional as F
 from .attend import Attend
 from .conditioner import BandEmbedder
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
@@ -88,7 +85,6 @@ class Attention(Module):
             dropout=0.,
             rotary_embed=None,
             flash=True,
-            sage_attention=False,
     ):
         super().__init__()
         self.heads = heads
@@ -97,10 +93,7 @@ class Attention(Module):
         self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
@@ -145,7 +138,6 @@ class LinearAttention(Module):
             scale=8,
             flash=True,
             dropout=0.,
-            sage_attention=False,
     ):
         super().__init__()
         dim_inner = dim_head * heads
@@ -158,18 +150,11 @@ class LinearAttention(Module):
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(
-                scale=scale,
-                dropout=dropout,
-                flash=flash
-            )
-        else:
-            self.attend = Attend(
-                scale=scale,
-                dropout=dropout,
-                flash=flash
-            )
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
@@ -207,7 +192,6 @@ class Transformer(Module):
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
-            sage_attention=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
@@ -220,7 +204,6 @@ class Transformer(Module):
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             else:
                 attn = Attention(
@@ -230,7 +213,6 @@ class Transformer(Module):
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             self.layers.append(ModuleList([
@@ -268,7 +250,6 @@ class ScaleTransformer(Module):
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
-            sage_attention=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
@@ -285,7 +266,6 @@ class ScaleTransformer(Module):
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             else:
                 attn = Attention(
@@ -295,7 +275,6 @@ class ScaleTransformer(Module):
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
@@ -353,7 +332,6 @@ class Transformer(Module):
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
-            sage_attention=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
@@ -366,7 +344,6 @@ class Transformer(Module):
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             else:
                 attn = Attention(
@@ -376,7 +353,6 @@ class Transformer(Module):
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             self.layers.append(ModuleList([
@@ -538,7 +514,7 @@ class BandConditionalBSRoformer(Module):
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
-            sage_attention=False,
     ):
         super().__init__()
@@ -551,9 +527,6 @@ class BandConditionalBSRoformer(Module):
         self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
@@ -562,7 +535,6 @@ class BandConditionalBSRoformer(Module):
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
-            sage_attention=sage_attention,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)
@@ -884,7 +856,7 @@ class BSRoformer_Conditional(Module):
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
-            sage_attention=False,
     ):
         super().__init__()
@@ -896,9 +868,6 @@ class BSRoformer_Conditional(Module):
         self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
@@ -907,7 +876,6 @@ class BSRoformer_Conditional(Module):
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
-            sage_attention=sage_attention,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)

 from .attend import Attend
 from .conditioner import BandEmbedder
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
             dropout=0.,
             rotary_embed=None,
             flash=True,
     ):
         super().__init__()
         self.heads = heads
         self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
             scale=8,
             flash=True,
             dropout=0.,
     ):
         super().__init__()
         dim_inner = dim_head * heads
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(
+            scale=scale,
+            dropout=dropout,
+            flash=flash
+        )
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
                 )
             else:
                 attn = Attention(
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
                 )
             self.layers.append(ModuleList([
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
                 )
             else:
                 attn = Attention(
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
                 )
             norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
                 )
             else:
                 attn = Attention(
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
                 )
             self.layers.append(ModuleList([
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
+            **kwargs
     ):
         super().__init__()
         self.layers = ModuleList([])
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
+            **kwargs
     ):
         super().__init__()
         self.layers = ModuleList([])
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)

models/bs_roformer/bs_roformer_fno.py CHANGED Viewed

@@ -1,704 +1,685 @@
-from functools import partial
-import torch
-from torch import nn, einsum, Tensor
-from torch.nn import Module, ModuleList
-import torch.nn.functional as F
-from .fno1d import FNO1d
-from .attend import Attend
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
-from torch.utils.checkpoint import checkpoint
-from beartype.typing import Tuple, Optional, List, Callable
-from beartype import beartype
-from rotary_embedding_torch import RotaryEmbedding
-from einops import rearrange, pack, unpack
-from einops.layers.torch import Rearrange
-def exists(val):
-    return val is not None
-def default(v, d):
-    return v if exists(v) else d
-def pack_one(t, pattern):
-    return pack([t], pattern)
-def unpack_one(t, ps, pattern):
-    return unpack(t, ps, pattern)[0]
-def l2norm(t):
-    return F.normalize(t, dim=-1, p=2)
-class RMSNorm(Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.scale = dim**0.5
-        self.gamma = nn.Parameter(torch.ones(dim))
-    def forward(self, x):
-        return F.normalize(x, dim=-1) * self.scale * self.gamma
-class FeedForward(Module):
-    def __init__(self, dim, mult=4, dropout=0.0):
-        super().__init__()
-        dim_inner = int(dim * mult)
-        self.net = nn.Sequential(
-            RMSNorm(dim),
-            nn.Linear(dim, dim_inner),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_inner, dim),
-            nn.Dropout(dropout),
-        )
-    def forward(self, x):
-        return self.net(x)
-class Attention(Module):
-    def __init__(
-        self,
-        dim,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        rotary_embed=None,
-        flash=True,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.heads = heads
-        self.scale = dim_head**-0.5
-        dim_inner = heads * dim_head
-        self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
-        self.to_gates = nn.Linear(dim, heads)
-        self.to_out = nn.Sequential(
-            nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)
-        )
-    def forward(self, x):
-        x = self.norm(x)
-        q, k, v = rearrange(
-            self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
-        )
-        if exists(self.rotary_embed):
-            q = self.rotary_embed.rotate_queries_or_keys(q)
-            k = self.rotary_embed.rotate_queries_or_keys(k)
-        out = self.attend(q, k, v)
-        gates = self.to_gates(x)
-        out = out * rearrange(gates, "b n h -> b h n 1").sigmoid()
-        out = rearrange(out, "b h n d -> b n (h d)")
-        return self.to_out(out)
-class LinearAttention(Module):
-    @beartype
-    def __init__(
-        self,
-        *,
-        dim,
-        dim_head=32,
-        heads=8,
-        scale=8,
-        flash=False,
-        dropout=0.0,
-        sage_attention=False,
-    ):
-        super().__init__()
-        dim_inner = dim_head * heads
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Sequential(
-            nn.Linear(dim, dim_inner * 3, bias=False),
-            Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads),
-        )
-        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(scale=scale, dropout=dropout, flash=flash)
-        else:
-            self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
-        self.to_out = nn.Sequential(
-            Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)
-        )
-    def forward(self, x):
-        x = self.norm(x)
-        q, k, v = self.to_qkv(x)
-        q, k = map(l2norm, (q, k))
-        q = q * self.temperature.exp()
-        out = self.attend(q, k, v)
-        return self.to_out(out)
-class Transformer(Module):
-    def __init__(
-        self,
-        *,
-        dim,
-        depth,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        ff_mult=4,
-        norm_output=True,
-        rotary_embed=None,
-        flash_attn=True,
-        linear_attn=False,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.layers = ModuleList([])
-        for _ in range(depth):
-            if linear_attn:
-                attn = LinearAttention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            else:
-                attn = Attention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    rotary_embed=rotary_embed,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            self.layers.append(
-                ModuleList(
-                    [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
-                )
-            )
-        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
-    def forward(self, x):
-        for attn, ff in self.layers:
-            x = attn(x) + x
-            x = ff(x) + x
-        return self.norm(x)
-class BandSplit(Module):
-    @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_features = ModuleList([])
-        for dim_in in dim_inputs:
-            net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))
-            self.to_features.append(net)
-    def forward(self, x):
-        x = x.split(self.dim_inputs, dim=-1)
-        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
-            split_output = to_feature(split_input)
-            outs.append(split_output)
-        return torch.stack(outs, dim=-2)
-def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
-    dim_hidden = default(dim_hidden, dim_in)
-    net = []
-    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-        is_last = ind == (len(dims) - 2)
-        net.append(nn.Linear(layer_dim_in, layer_dim_out))
-        if is_last:
-            continue
-        net.append(activation())
-    return nn.Sequential(*net)
-class MaskEstimator(Module):
-    @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_freqs = ModuleList([])
-        dim_hidden = dim * mlp_expansion_factor
-        for dim_in in dim_inputs:
-            net = []
-            mlp = nn.Sequential(
-                FNO1d(
-                    n_modes_height=64,
-                    hidden_channels=dim,
-                    in_channels=dim,
-                    out_channels=dim_in * 2,
-                    lifting_channels=dim,
-                    projection_channels=dim,
-                    n_layers=3,
-                    separable=True,
-                ),
-                nn.GLU(dim=-2),
-            )
-            self.to_freqs.append(mlp)
-    def forward(self, x):
-        x = x.unbind(dim=-2)
-        outs = []
-        for band_features, mlp in zip(x, self.to_freqs):
-            band_features = rearrange(band_features, "b t c -> b c t")
-            with torch.autocast(device_type="cuda", enabled=False, dtype=torch.float32):
-                freq_out = mlp(band_features).float()
-            freq_out = rearrange(freq_out, "b c t -> b t c")
-            outs.append(freq_out)
-        return torch.cat(outs, dim=-1)
-DEFAULT_FREQS_PER_BANDS = (
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    128,
-    129,
-)
-class BSRoformer_FNO(Module):
-    @beartype
-    def __init__(
-        self,
-        dim,
-        *,
-        depth,
-        stereo=False,
-        num_stems=1,
-        time_transformer_depth=2,
-        freq_transformer_depth=2,
-        linear_transformer_depth=0,
-        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        flash_attn=True,
-        dim_freqs_in=1025,
-        stft_n_fft=2048,
-        stft_hop_length=512,
-        stft_win_length=2048,
-        stft_normalized=False,
-        stft_window_fn: Optional[Callable] = None,
-        mask_estimator_depth=2,
-        multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
-            4096,
-            2048,
-            1024,
-            512,
-            256,
-        ),
-        multi_stft_hop_size=147,
-        multi_stft_normalized=False,
-        multi_stft_window_fn: Callable = torch.hann_window,
-        mlp_expansion_factor=4,
-        use_torch_checkpoint=False,
-        skip_connection=False,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.stereo = stereo
-        self.audio_channels = 2 if stereo else 1
-        self.num_stems = num_stems
-        self.use_torch_checkpoint = use_torch_checkpoint
-        self.skip_connection = skip_connection
-        self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
-        transformer_kwargs = dict(
-            dim=dim,
-            heads=heads,
-            dim_head=dim_head,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            flash_attn=flash_attn,
-            norm_output=False,
-            sage_attention=sage_attention,
-        )
-        time_rotary_embed = RotaryEmbedding(dim=dim_head)
-        freq_rotary_embed = RotaryEmbedding(dim=dim_head)
-        for _ in range(depth):
-            tran_modules = []
-            if linear_transformer_depth > 0:
-                tran_modules.append(
-                    Transformer(
-                        depth=linear_transformer_depth,
-                        linear_attn=True,
-                        **transformer_kwargs,
-                    )
-                )
-            tran_modules.append(
-                Transformer(
-                    depth=time_transformer_depth,
-                    rotary_embed=time_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            tran_modules.append(
-                Transformer(
-                    depth=freq_transformer_depth,
-                    rotary_embed=freq_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            self.layers.append(nn.ModuleList(tran_modules))
-        self.final_norm = RMSNorm(dim)
-        self.stft_kwargs = dict(
-            n_fft=stft_n_fft,
-            hop_length=stft_hop_length,
-            win_length=stft_win_length,
-            normalized=stft_normalized,
-        )
-        self.stft_window_fn = partial(
-            default(stft_window_fn, torch.hann_window), stft_win_length
-        )
-        freqs = torch.stft(
-            torch.randn(1, 4096),
-            **self.stft_kwargs,
-            window=torch.ones(stft_win_length),
-            return_complex=True,
-        ).shape[1]
-        assert len(freqs_per_bands) > 1
-        assert (
-            sum(freqs_per_bands) == freqs
-        ), f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
-        freqs_per_bands_with_complex = tuple(
-            2 * f * self.audio_channels for f in freqs_per_bands
-        )
-        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
-        self.mask_estimators = nn.ModuleList([])
-        for _ in range(num_stems):
-            mask_estimator = MaskEstimator(
-                dim=dim,
-                dim_inputs=freqs_per_bands_with_complex,
-                depth=mask_estimator_depth,
-                mlp_expansion_factor=mlp_expansion_factor,
-            )
-            self.mask_estimators.append(mask_estimator)
-        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
-        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
-        self.multi_stft_n_fft = stft_n_fft
-        self.multi_stft_window_fn = multi_stft_window_fn
-        self.multi_stft_kwargs = dict(
-            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
-        )
-    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
-        device = raw_audio.device
-        x_is_mps = True if device.type == "mps" else False
-        if raw_audio.ndim == 2:
-            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
-        channels = raw_audio.shape[1]
-        assert (not self.stereo and channels == 1) or (
-            self.stereo and channels == 2
-        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
-        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
-        stft_window = self.stft_window_fn(device=device)
-        try:
-            stft_repr = torch.stft(
-                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
-            )
-        except:
-            stft_repr = torch.stft(
-                raw_audio.cpu() if x_is_mps else raw_audio,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=True,
-            ).to(device)
-        stft_repr = torch.view_as_real(stft_repr)
-        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
-        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
-        x = rearrange(stft_repr, "b f t c -> b t (f c)")
-        if self.use_torch_checkpoint:
-            x = checkpoint(self.band_split, x, use_reentrant=False)
-        else:
-            x = self.band_split(x)
-        store = [None] * len(self.layers)
-        for i, transformer_block in enumerate(self.layers):
-            if len(transformer_block) == 3:
-                linear_transformer, time_transformer, freq_transformer = (
-                    transformer_block
-                )
-                x, ft_ps = pack([x], "b * d")
-                if self.use_torch_checkpoint:
-                    x = checkpoint(linear_transformer, x, use_reentrant=False)
-                else:
-                    x = linear_transformer(x)
-                (x,) = unpack(x, ft_ps, "b * d")
-            else:
-                time_transformer, freq_transformer = transformer_block
-            if self.skip_connection:
-                for j in range(i):
-                    x = x + store[j]
-            x = rearrange(x, "b t f d -> b f t d")
-            x, ps = pack([x], "* t d")
-            if self.use_torch_checkpoint:
-                x = checkpoint(time_transformer, x, use_reentrant=False)
-            else:
-                x = time_transformer(x)
-            (x,) = unpack(x, ps, "* t d")
-            x = rearrange(x, "b f t d -> b t f d")
-            x, ps = pack([x], "* f d")
-            if self.use_torch_checkpoint:
-                x = checkpoint(freq_transformer, x, use_reentrant=False)
-            else:
-                x = freq_transformer(x)
-            (x,) = unpack(x, ps, "* f d")
-            if self.skip_connection:
-                store[i] = x
-        x = self.final_norm(x)
-        num_stems = len(self.mask_estimators)
-        if self.use_torch_checkpoint:
-            mask = torch.stack(
-                [checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators],
-                dim=1,
-            )
-        else:
-            mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
-        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
-        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
-        stft_repr = torch.view_as_complex(stft_repr)
-        mask = torch.view_as_complex(mask)
-        stft_repr = stft_repr * mask
-        stft_repr = rearrange(
-            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
-        )
-        try:
-            recon_audio = torch.istft(
-                stft_repr,
-                **self.stft_kwargs,
-                window=stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            )
-        except:
-            recon_audio = torch.istft(
-                stft_repr.cpu() if x_is_mps else stft_repr,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            ).to(device)
-        recon_audio = rearrange(
-            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
-        )
-        if num_stems == 1:
-            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
-        if not exists(target):
-            return recon_audio
-        if self.num_stems > 1:
-            assert target.ndim == 4 and target.shape[1] == self.num_stems
-        if target.ndim == 2:
-            target = rearrange(target, "... t -> ... 1 t")
-        target = target[..., : recon_audio.shape[-1]]
-        loss = F.l1_loss(recon_audio, target)
-        multi_stft_resolution_loss = 0.0
-        for window_size in self.multi_stft_resolutions_window_sizes:
-            res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft),
-                win_length=window_size,
-                return_complex=True,
-                window=self.multi_stft_window_fn(window_size, device=device),
-                **self.multi_stft_kwargs,
-            )
-            recon_Y = torch.stft(
-                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            target_Y = torch.stft(
-                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
-                recon_Y, target_Y
-            )
-        weighted_multi_resolution_loss = (
-            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
-        )
-        total_loss = loss + weighted_multi_resolution_loss
-        if not return_loss_breakdown:
-            return total_loss
-        return total_loss, (loss, multi_stft_resolution_loss)

+from functools import partial
+import torch
+from torch import nn, einsum, Tensor
+from torch.nn import Module, ModuleList
+import torch.nn.functional as F
+from .fno1d import FNO1d
+from .attend import Attend
+from torch.utils.checkpoint import checkpoint
+from beartype.typing import Tuple, Optional, List, Callable
+from beartype import beartype
+from rotary_embedding_torch import RotaryEmbedding
+from einops import rearrange, pack, unpack
+from einops.layers.torch import Rearrange
+def exists(val):
+    """Return ``True`` when ``val`` is anything other than ``None``."""
+    if val is None:
+        return False
+    return True
+def default(v, d):
+    """Return ``v`` unless it is ``None``, in which case return the fallback ``d``.
+    Falsy-but-not-None values (``0``, ``""``, ``[]``) are kept as-is.
+    """
+    if v is None:
+        return d
+    return v
+def pack_one(t, pattern):
+    """Call einops ``pack`` on the single-element list ``[t]`` with ``pattern``.
+    The result of ``pack`` is returned unchanged; ``forward`` code in this file
+    unpacks it as ``(packed_tensor, packed_shapes)``.
+    """
+    singleton = [t]
+    return pack(singleton, pattern)
+def unpack_one(t, ps, pattern):
+    """Call einops ``unpack`` on ``t`` with shapes ``ps`` and return its first result.
+    Counterpart of ``pack_one``: only element ``[0]`` of the unpacked sequence is returned.
+    """
+    pieces = unpack(t, ps, pattern)
+    return pieces[0]
+def l2norm(t):
+    """Normalise ``t`` to unit L2 norm along its last dimension."""
+    return F.normalize(t, p=2, dim=-1)
+class RMSNorm(Module):
+    """Normalisation layer with a learned per-feature gain.
+    The input is L2-normalised along its last dimension, multiplied by the
+    constant ``sqrt(dim)`` and then by the learnable vector ``gamma``.
+    """
+    def __init__(self, dim):
+        super().__init__()
+        # Learned gain of length ``dim``, initialised to ones.
+        self.gamma = nn.Parameter(torch.ones(dim))
+        # Constant sqrt(dim) factor applied after normalisation.
+        self.scale = dim ** 0.5
+    def forward(self, x):
+        unit = F.normalize(x, dim=-1)
+        return unit * self.scale * self.gamma
+class FeedForward(Module):
+    """Pre-normalised two-layer MLP.
+    Pipeline: RMSNorm -> Linear(dim, dim*mult) -> GELU -> Dropout -> Linear(dim*mult, dim) -> Dropout.
+    No residual connection is applied here; ``Transformer.forward`` adds it.
+    Args:
+        dim: size of the input and output feature dimension.
+        mult: expansion factor for the hidden width (``int(dim * mult)``).
+        dropout: dropout probability used after the activation and after the output projection.
+    """
+    def __init__(self, dim, mult=4, dropout=0.0):
+        super().__init__()
+        hidden = int(dim * mult)
+        # Stages are created in the same order as they run, so the Sequential indices stay 0..5.
+        stages = [
+            RMSNorm(dim),
+            nn.Linear(dim, hidden),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden, dim),
+            nn.Dropout(dropout),
+        ]
+        self.net = nn.Sequential(*stages)
+    def forward(self, x):
+        return self.net(x)
+class Attention(Module):
+    """Gated multi-head self-attention block with pre-normalisation.
+    The input is RMS-normalised, projected to q/k/v, optionally rotated with
+    ``rotary_embed``, passed through ``Attend``, multiplied by a per-head sigmoid
+    gate computed from the normalised input, and projected back to ``dim``.
+    No residual connection is applied here; ``Transformer.forward`` adds it.
+    Args:
+        dim: feature dimension of the input and output.
+        heads: number of attention heads.
+        dim_head: feature dimension of each head.
+        dropout: dropout probability, forwarded to ``Attend`` and used after the output projection.
+        rotary_embed: object exposing ``rotate_queries_or_keys``, or ``None`` to skip rotation.
+        flash: forwarded to ``Attend`` as its ``flash`` argument.
+    """
+    def __init__(
+        self,
+        dim,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        rotary_embed=None,
+        flash=True,
+    ):
+        super().__init__()
+        inner_width = heads * dim_head
+        self.heads = heads
+        # Not read by the methods of this class; kept as a public attribute.
+        self.scale = dim_head ** -0.5
+        self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Linear(dim, inner_width * 3, bias=False)
+        # One gate logit per head and position.
+        self.to_gates = nn.Linear(dim, heads)
+        out_proj = nn.Linear(inner_width, dim, bias=False)
+        self.to_out = nn.Sequential(out_proj, nn.Dropout(dropout))
+    def forward(self, x):
+        normed = self.norm(x)
+        fused_qkv = self.to_qkv(normed)
+        # Split the fused projection into q, k, v, each laid out as (b, h, n, d).
+        q, k, v = rearrange(
+            fused_qkv, "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
+        )
+        rotary = self.rotary_embed
+        if rotary is not None:
+            q = rotary.rotate_queries_or_keys(q)
+            k = rotary.rotate_queries_or_keys(k)
+        attended = self.attend(q, k, v)
+        # Gates come from the normalised input and broadcast over the head feature axis.
+        gate = rearrange(self.to_gates(normed), "b n h -> b h n 1").sigmoid()
+        merged = rearrange(attended * gate, "b h n d -> b n (h d)")
+        return self.to_out(merged)
+class LinearAttention(Module):
+    """Attention variant that feeds ``Attend`` with q/k/v laid out as (b, h, d, n).
+    The input is RMS-normalised and projected to q/k/v; q and k are L2-normalised
+    along their last axis (``n`` in this layout) and q is scaled by
+    ``exp(temperature)``, a learned per-head parameter.
+    The result is rearranged to (b, n, h*d) and projected back to ``dim``.
+    No residual connection is applied here; ``Transformer.forward`` adds it.
+    Args (keyword-only):
+        dim: feature dimension of the input and output.
+        dim_head: feature dimension of each head.
+        heads: number of heads.
+        scale: forwarded to ``Attend`` as its ``scale`` argument.
+        flash: forwarded to ``Attend`` as its ``flash`` argument.
+        dropout: forwarded to ``Attend`` as its ``dropout`` argument.
+    """
+    @beartype
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head=32,
+        heads=8,
+        scale=8,
+        flash=False,
+        dropout=0.0,
+    ):
+        super().__init__()
+        inner_width = dim_head * heads
+        self.norm = RMSNorm(dim)
+        qkv_proj = nn.Linear(dim, inner_width * 3, bias=False)
+        # The Rearrange puts the sequence axis last: each of q, k, v is (b, h, d, n).
+        split_heads = Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads)
+        self.to_qkv = nn.Sequential(qkv_proj, split_heads)
+        # Per-head log-temperature, initialised to one.
+        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
+        merge_heads = Rearrange("b h d n -> b n (h d)")
+        self.to_out = nn.Sequential(
+            merge_heads, nn.Linear(inner_width, dim, bias=False)
+        )
+    def forward(self, x):
+        normed = self.norm(x)
+        q, k, v = self.to_qkv(normed)
+        q = l2norm(q)
+        k = l2norm(k)
+        q = q * self.temperature.exp()
+        attended = self.attend(q, k, v)
+        return self.to_out(attended)
+class Transformer(Module):
+    """Stack of residual (attention, feed-forward) layers with an optional final norm.
+
+    Args:
+        dim: Feature width shared by every layer.
+        depth: Number of (attention, feed-forward) pairs.
+        dim_head: Per-head width passed to the attention module.
+        heads: Number of attention heads.
+        attn_dropout: Dropout rate for the attention module.
+        ff_dropout: Dropout rate for the feed-forward module.
+        ff_mult: Hidden-width multiplier for the feed-forward module.
+        norm_output: When true the output goes through ``RMSNorm``; otherwise ``nn.Identity``.
+        rotary_embed: Passed to ``Attention`` only; ignored when ``linear_attn`` is true.
+        flash_attn: Forwarded as ``flash`` to the attention module.
+        linear_attn: Use ``LinearAttention`` instead of ``Attention``.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        ff_mult=4,
+        norm_output=True,
+        rotary_embed=None,
+        flash_attn=True,
+        linear_attn=False,
+    ):
+        super().__init__()
+        self.layers = ModuleList([])
+        for _ in range(depth):
+            if linear_attn:
+                # LinearAttention takes no rotary embedding.
+                attn = LinearAttention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    flash=flash_attn,
+                )
+            else:
+                attn = Attention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    rotary_embed=rotary_embed,
+                    flash=flash_attn,
+                )
+            self.layers.append(
+                ModuleList(
+                    [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
+                )
+            )
+        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
+    def forward(self, x):
+        """Apply every layer with residual connections, then the output norm."""
+        for attn, ff in self.layers:
+            # Both sub-layers normalise their own input, so the residual adds the raw x.
+            x = attn(x) + x
+            x = ff(x) + x
+        return self.norm(x)
+class BandSplit(Module):
+    @beartype
+    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_features = ModuleList([])
+        for dim_in in dim_inputs:
+            net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))
+            self.to_features.append(net)
+    def forward(self, x):
+        x = x.split(self.dim_inputs, dim=-1)
+        outs = []
+        for split_input, to_feature in zip(x, self.to_features):
+            split_output = to_feature(split_input)
+            outs.append(split_output)
+        return torch.stack(outs, dim=-2)
+def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
+    dim_hidden = default(dim_hidden, dim_in)
+    net = []
+    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
+    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
+        is_last = ind == (len(dims) - 2)
+        net.append(nn.Linear(layer_dim_in, layer_dim_out))
+        if is_last:
+            continue
+        net.append(activation())
+    return nn.Sequential(*net)
+class MaskEstimator(Module):
+    @beartype
+    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_freqs = ModuleList([])
+        dim_hidden = dim * mlp_expansion_factor
+        for dim_in in dim_inputs:
+            net = []
+            mlp = nn.Sequential(
+                FNO1d(
+                    n_modes_height=64,
+                    hidden_channels=dim,
+                    in_channels=dim,
+                    out_channels=dim_in * 2,
+                    lifting_channels=dim,
+                    projection_channels=dim,
+                    n_layers=3,
+                    separable=True,
+                ),
+                nn.GLU(dim=-2),
+            )
+            self.to_freqs.append(mlp)
+    def forward(self, x):
+        x = x.unbind(dim=-2)
+        outs = []
+        for band_features, mlp in zip(x, self.to_freqs):
+            band_features = rearrange(band_features, "b t c -> b c t")
+            with torch.autocast(device_type="cuda", enabled=False, dtype=torch.float32):
+                freq_out = mlp(band_features).float()
+            freq_out = rearrange(freq_out, "b c t -> b t c")
+            outs.append(freq_out)
+        return torch.cat(outs, dim=-1)
+DEFAULT_FREQS_PER_BANDS = (
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    12,
+    12,
+    12,
+    12,
+    12,
+    12,
+    12,
+    12,
+    24,
+    24,
+    24,
+    24,
+    24,
+    24,
+    24,
+    24,
+    48,
+    48,
+    48,
+    48,
+    48,
+    48,
+    48,
+    48,
+    128,
+    129,
+)
+class BSRoformer_FNO(Module):
+    @beartype
+    def __init__(
+        self,
+        dim,
+        *,
+        depth,
+        stereo=False,
+        num_stems=1,
+        time_transformer_depth=2,
+        freq_transformer_depth=2,
+        linear_transformer_depth=0,
+        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        flash_attn=True,
+        dim_freqs_in=1025,
+        stft_n_fft=2048,
+        stft_hop_length=512,
+        stft_win_length=2048,
+        stft_normalized=False,
+        stft_window_fn: Optional[Callable] = None,
+        mask_estimator_depth=2,
+        multi_stft_resolution_loss_weight=1.0,
+        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
+            4096,
+            2048,
+            1024,
+            512,
+            256,
+        ),
+        multi_stft_hop_size=147,
+        multi_stft_normalized=False,
+        multi_stft_window_fn: Callable = torch.hann_window,
+        mlp_expansion_factor=4,
+        use_torch_checkpoint=False,
+        skip_connection=False,
+        **kwargs
+    ):
+        super().__init__()
+        self.stereo = stereo
+        self.audio_channels = 2 if stereo else 1
+        self.num_stems = num_stems
+        self.use_torch_checkpoint = use_torch_checkpoint
+        self.skip_connection = skip_connection
+        self.layers = ModuleList([])
+        transformer_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
+            norm_output=False,
+        )
+        time_rotary_embed = RotaryEmbedding(dim=dim_head)
+        freq_rotary_embed = RotaryEmbedding(dim=dim_head)
+        for _ in range(depth):
+            tran_modules = []
+            if linear_transformer_depth > 0:
+                tran_modules.append(
+                    Transformer(
+                        depth=linear_transformer_depth,
+                        linear_attn=True,
+                        **transformer_kwargs,
+                    )
+                )
+            tran_modules.append(
+                Transformer(
+                    depth=time_transformer_depth,
+                    rotary_embed=time_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            tran_modules.append(
+                Transformer(
+                    depth=freq_transformer_depth,
+                    rotary_embed=freq_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            self.layers.append(nn.ModuleList(tran_modules))
+        self.final_norm = RMSNorm(dim)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized,
+        )
+        self.stft_window_fn = partial(
+            default(stft_window_fn, torch.hann_window), stft_win_length
+        )
+        freqs = torch.stft(
+            torch.randn(1, 4096),
+            **self.stft_kwargs,
+            window=torch.ones(stft_win_length),
+            return_complex=True,
+        ).shape[1]
+        assert len(freqs_per_bands) > 1
+        assert (
+            sum(freqs_per_bands) == freqs
+        ), f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
+        freqs_per_bands_with_complex = tuple(
+            2 * f * self.audio_channels for f in freqs_per_bands
+        )
+        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
+        self.mask_estimators = nn.ModuleList([])
+        for _ in range(num_stems):
+            mask_estimator = MaskEstimator(
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
+                depth=mask_estimator_depth,
+                mlp_expansion_factor=mlp_expansion_factor,
+            )
+            self.mask_estimators.append(mask_estimator)
+        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
+        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
+        self.multi_stft_n_fft = stft_n_fft
+        self.multi_stft_window_fn = multi_stft_window_fn
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
+        )
+    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
+        device = raw_audio.device
+        x_is_mps = True if device.type == "mps" else False
+        if raw_audio.ndim == 2:
+            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
+        channels = raw_audio.shape[1]
+        assert (not self.stereo and channels == 1) or (
+            self.stereo and channels == 2
+        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
+        stft_window = self.stft_window_fn(device=device)
+        try:
+            stft_repr = torch.stft(
+                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
+            )
+        except:
+            stft_repr = torch.stft(
+                raw_audio.cpu() if x_is_mps else raw_audio,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=True,
+            ).to(device)
+        stft_repr = torch.view_as_real(stft_repr)
+        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
+        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
+        x = rearrange(stft_repr, "b f t c -> b t (f c)")
+        if self.use_torch_checkpoint:
+            x = checkpoint(self.band_split, x, use_reentrant=False)
+        else:
+            x = self.band_split(x)
+        store = [None] * len(self.layers)
+        for i, transformer_block in enumerate(self.layers):
+            if len(transformer_block) == 3:
+                linear_transformer, time_transformer, freq_transformer = (
+                    transformer_block
+                )
+                x, ft_ps = pack([x], "b * d")
+                if self.use_torch_checkpoint:
+                    x = checkpoint(linear_transformer, x, use_reentrant=False)
+                else:
+                    x = linear_transformer(x)
+                (x,) = unpack(x, ft_ps, "b * d")
+            else:
+                time_transformer, freq_transformer = transformer_block
+            if self.skip_connection:
+                for j in range(i):
+                    x = x + store[j]
+            x = rearrange(x, "b t f d -> b f t d")
+            x, ps = pack([x], "* t d")
+            if self.use_torch_checkpoint:
+                x = checkpoint(time_transformer, x, use_reentrant=False)
+            else:
+                x = time_transformer(x)
+            (x,) = unpack(x, ps, "* t d")
+            x = rearrange(x, "b f t d -> b t f d")
+            x, ps = pack([x], "* f d")
+            if self.use_torch_checkpoint:
+                x = checkpoint(freq_transformer, x, use_reentrant=False)
+            else:
+                x = freq_transformer(x)
+            (x,) = unpack(x, ps, "* f d")
+            if self.skip_connection:
+                store[i] = x
+        x = self.final_norm(x)
+        num_stems = len(self.mask_estimators)
+        if self.use_torch_checkpoint:
+            mask = torch.stack(
+                [checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators],
+                dim=1,
+            )
+        else:
+            mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
+        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
+        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
+        stft_repr = torch.view_as_complex(stft_repr)
+        mask = torch.view_as_complex(mask)
+        stft_repr = stft_repr * mask
+        stft_repr = rearrange(
+            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
+        )
+        try:
+            recon_audio = torch.istft(
+                stft_repr,
+                **self.stft_kwargs,
+                window=stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            )
+        except:
+            recon_audio = torch.istft(
+                stft_repr.cpu() if x_is_mps else stft_repr,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            ).to(device)
+        recon_audio = rearrange(
+            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
+        )
+        if num_stems == 1:
+            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
+        if not exists(target):
+            return recon_audio
+        if self.num_stems > 1:
+            assert target.ndim == 4 and target.shape[1] == self.num_stems
+        if target.ndim == 2:
+            target = rearrange(target, "... t -> ... 1 t")
+        target = target[..., : recon_audio.shape[-1]]
+        loss = F.l1_loss(recon_audio, target)
+        multi_stft_resolution_loss = 0.0
+        for window_size in self.multi_stft_resolutions_window_sizes:
+            res_stft_kwargs = dict(
+                n_fft=max(window_size, self.multi_stft_n_fft),
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
+            )
+            recon_Y = torch.stft(
+                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            target_Y = torch.stft(
+                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
+                recon_Y, target_Y
+            )
+        weighted_multi_resolution_loss = (
+            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        )
+        total_loss = loss + weighted_multi_resolution_loss
+        if not return_loss_breakdown:
+            return total_loss
+        return total_loss, (loss, multi_stft_resolution_loss)

models/bs_roformer/bs_roformer_hyperace.py CHANGED Viewed

@@ -1,1122 +1,1103 @@
-from functools import partial
-import torch
-from torch import nn, einsum, Tensor
-from torch.nn import Module, ModuleList
-import torch.nn.functional as F
-from .attend import Attend
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
-from torch.utils.checkpoint import checkpoint
-from beartype.typing import Tuple, Optional, List, Callable
-from beartype import beartype
-from rotary_embedding_torch import RotaryEmbedding
-from einops import rearrange, pack, unpack
-from einops.layers.torch import Rearrange
-import torchaudio
-def exists(val):
-    return val is not None
-def default(v, d):
-    return v if exists(v) else d
-def pack_one(t, pattern):
-    return pack([t], pattern)
-def unpack_one(t, ps, pattern):
-    return unpack(t, ps, pattern)[0]
-def l2norm(t):
-    return F.normalize(t, dim=-1, p=2)
-class RMSNorm(Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.scale = dim**0.5
-        self.gamma = nn.Parameter(torch.ones(dim))
-    def forward(self, x):
-        return F.normalize(x, dim=-1) * self.scale * self.gamma
-class FeedForward(Module):
-    def __init__(self, dim, mult=4, dropout=0.0):
-        super().__init__()
-        dim_inner = int(dim * mult)
-        self.net = nn.Sequential(
-            RMSNorm(dim),
-            nn.Linear(dim, dim_inner),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_inner, dim),
-            nn.Dropout(dropout),
-        )
-    def forward(self, x):
-        return self.net(x)
-class Attention(Module):
-    """Multi-head self-attention with a pre-norm and per-head sigmoid gates.
-
-    Args:
-        dim: Size of the last axis of the input and of the output.
-        heads: Number of attention heads.
-        dim_head: Width of each head.
-        dropout: Rate passed to the attend module and used after the output projection.
-        rotary_embed: Optional object exposing ``rotate_queries_or_keys``.
-        flash: Forwarded to the attend module.
-        sage_attention: Use ``AttendSage`` instead of ``Attend``.
-    """
-    def __init__(
-        self,
-        dim,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        rotary_embed=None,
-        flash=True,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.heads = heads
-        # Stored for reference; not read anywhere else in this class.
-        self.scale = dim_head**-0.5
-        dim_inner = heads * dim_head
-        self.rotary_embed = rotary_embed
-        # AttendSage only exists if the optional import at the top of the file
-        # succeeded; otherwise this branch raises NameError.
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
-        self.norm = RMSNorm(dim)
-        # Single fused projection producing q, k and v.
-        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
-        # One gate logit per head and position.
-        self.to_gates = nn.Linear(dim, heads)
-        self.to_out = nn.Sequential(
-            nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)
-        )
-    def forward(self, x):
-        """Attend over axis ``n`` of a ``(b, n, dim)`` input and return ``(b, n, dim)``."""
-        x = self.norm(x)
-        # Split the fused projection into q, k, v, each shaped (b, h, n, d).
-        q, k, v = rearrange(
-            self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
-        )
-        if exists(self.rotary_embed):
-            q = self.rotary_embed.rotate_queries_or_keys(q)
-            k = self.rotary_embed.rotate_queries_or_keys(k)
-        out = self.attend(q, k, v)
-        # Gates come from the normalised input, not from the attention output.
-        gates = self.to_gates(x)
-        out = out * rearrange(gates, "b n h -> b h n 1").sigmoid()
-        out = rearrange(out, "b h n d -> b n (h d)")
-        return self.to_out(out)
-class LinearAttention(Module):
-    """Attention computed across the per-head feature axis instead of the sequence.
-
-    q, k and v are rearranged to ``(b, h, d, n)`` before attending.
-
-    Args:
-        dim: Size of the last axis of the input and of the output.
-        dim_head: Width of each head.
-        heads: Number of heads.
-        scale: Forwarded to the attend module as ``scale``.
-        flash: Forwarded to the attend module.
-        dropout: Forwarded to the attend module.
-        sage_attention: Use ``AttendSage`` instead of ``Attend``.
-    """
-    @beartype
-    def __init__(
-        self,
-        *,
-        dim,
-        dim_head=32,
-        heads=8,
-        scale=8,
-        flash=True,
-        dropout=0.0,
-        sage_attention=False,
-    ):
-        super().__init__()
-        dim_inner = dim_head * heads
-        self.norm = RMSNorm(dim)
-        # Fused q/k/v projection followed by the transpose to (qkv, b, h, d, n).
-        self.to_qkv = nn.Sequential(
-            nn.Linear(dim, dim_inner * 3, bias=False),
-            Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads),
-        )
-        # Learned per-head log-temperature, initialised to 1.
-        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        # AttendSage only exists if the optional import at the top of the file succeeded.
-        if sage_attention:
-            self.attend = AttendSage(scale=scale, dropout=dropout, flash=flash)
-        else:
-            self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
-        self.to_out = nn.Sequential(
-            Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)
-        )
-    def forward(self, x):
-        """Map a ``(b, n, dim)`` input to a ``(b, n, dim)`` output."""
-        x = self.norm(x)
-        q, k, v = self.to_qkv(x)
-        # Unit-normalise q and k along their last axis (n after the rearrange).
-        q, k = map(l2norm, (q, k))
-        q = q * self.temperature.exp()
-        out = self.attend(q, k, v)
-        return self.to_out(out)
-class Transformer(Module):
-    """Stack of residual (attention, feed-forward) layers with an optional final norm.
-
-    Args:
-        dim: Feature width shared by every layer.
-        depth: Number of (attention, feed-forward) pairs.
-        dim_head: Per-head width passed to the attention module.
-        heads: Number of attention heads.
-        attn_dropout: Dropout rate for the attention module.
-        ff_dropout: Dropout rate for the feed-forward module.
-        ff_mult: Hidden-width multiplier for the feed-forward module.
-        norm_output: When true the output goes through ``RMSNorm``; otherwise ``nn.Identity``.
-        rotary_embed: Passed to ``Attention`` only; ignored when ``linear_attn`` is true.
-        flash_attn: Forwarded as ``flash`` to the attention module.
-        linear_attn: Use ``LinearAttention`` instead of ``Attention``.
-        sage_attention: Forwarded to the attention module.
-    """
-    def __init__(
-        self,
-        *,
-        dim,
-        depth,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        ff_mult=4,
-        norm_output=True,
-        rotary_embed=None,
-        flash_attn=True,
-        linear_attn=False,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.layers = ModuleList([])
-        for _ in range(depth):
-            if linear_attn:
-                # LinearAttention takes no rotary embedding.
-                attn = LinearAttention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            else:
-                attn = Attention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    rotary_embed=rotary_embed,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            self.layers.append(
-                ModuleList(
-                    [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
-                )
-            )
-        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
-    def forward(self, x):
-        """Apply every layer with residual connections, then the output norm."""
-        for attn, ff in self.layers:
-            x = attn(x) + x
-            x = ff(x) + x
-        return self.norm(x)
-class BandSplit(Module):
-    @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_features = ModuleList([])
-        for dim_in in dim_inputs:
-            net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))
-            self.to_features.append(net)
-    def forward(self, x):
-        x = x.split(self.dim_inputs, dim=-1)
-        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
-            split_output = to_feature(split_input)
-            outs.append(split_output)
-        x = torch.stack(outs, dim=-2)
-        return x
-class Conv(nn.Module):
-    """Bias-free ``Conv2d`` followed by affine ``InstanceNorm2d`` and an optional SiLU.
-
-    Args:
-        c1: Input channels.
-        c2: Output channels.
-        k: Kernel size.
-        s: Stride.
-        p: Padding; when None it is derived from ``k`` by ``autopad``.
-        g: Convolution groups.
-        act: Apply SiLU when true, identity otherwise.
-    """
-    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
-        super().__init__()
-        # autopad is defined later in the file; it is resolved when this runs.
-        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
-        # Named ``bn`` but holds an instance norm.
-        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)
-        self.act = nn.SiLU() if act else nn.Identity()
-    def forward(self, x):
-        return self.act(self.bn(self.conv(x)))
-def autopad(k, p=None):
-    if p is None:
-        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
-    return p
-class DSConv(nn.Module):
-    """Depthwise-separable convolution: depthwise conv, 1x1 pointwise conv, instance norm, optional SiLU.
-
-    Args:
-        c1: Input channels (also the depthwise group count).
-        c2: Output channels of the pointwise convolution.
-        k: Depthwise kernel size.
-        s: Depthwise stride.
-        p: Depthwise padding; derived from ``k`` by ``autopad`` when None.
-        act: Apply SiLU when true, identity otherwise.
-    """
-    def __init__(self, c1, c2, k=3, s=1, p=None, act=True):
-        super().__init__()
-        # groups=c1 makes every input channel its own group.
-        self.dwconv = nn.Conv2d(c1, c1, k, s, autopad(k, p), groups=c1, bias=False)
-        self.pwconv = nn.Conv2d(c1, c2, 1, 1, 0, bias=False)
-        # Named ``bn`` but holds an instance norm.
-        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)
-        self.act = nn.SiLU() if act else nn.Identity()
-    def forward(self, x):
-        return self.act(self.bn(self.pwconv(self.dwconv(x))))
-class DS_Bottleneck(nn.Module):
-    def __init__(self, c1, c2, k=3, shortcut=True):
-        super().__init__()
-        c_ = c1
-        self.dsconv1 = DSConv(c1, c_, k=3, s=1)
-        self.dsconv2 = DSConv(c_, c2, k=k, s=1)
-        self.shortcut = shortcut and c1 == c2
-    def forward(self, x):
-        return (
-            x + self.dsconv2(self.dsconv1(x))
-            if self.shortcut
-            else self.dsconv2(self.dsconv1(x))
-        )
-class DS_C3k(nn.Module):
-    """Two-branch block: one branch runs ``n`` ``DS_Bottleneck`` layers, the other is a 1x1 conv.
-
-    Args:
-        c1: Input channels.
-        c2: Output channels.
-        n: Number of bottlenecks in the main branch.
-        k: Kernel size passed to each bottleneck.
-        e: Hidden-width ratio; each branch is ``int(c2 * e)`` channels wide.
-    """
-    def __init__(self, c1, c2, n=1, k=3, e=0.5):
-        super().__init__()
-        c_ = int(c2 * e)
-        self.cv1 = Conv(c1, c_, 1, 1)
-        self.cv2 = Conv(c1, c_, 1, 1)
-        # Fuses the two concatenated branches back to c2 channels.
-        self.cv3 = Conv(2 * c_, c2, 1, 1)
-        self.m = nn.Sequential(
-            *[DS_Bottleneck(c_, c_, k=k, shortcut=True) for _ in range(n)]
-        )
-    def forward(self, x):
-        # Concatenate the bottleneck branch and the bypass branch on the channel axis.
-        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
-class DS_C3k2(nn.Module):
-    """1x1 conv down to ``int(c2 * e)`` channels, a ``DS_C3k`` at that width, then a 1x1 conv to ``c2``.
-
-    Args:
-        c1: Input channels.
-        c2: Output channels.
-        n: Number of bottlenecks inside the inner ``DS_C3k``.
-        k: Kernel size passed to the inner ``DS_C3k``.
-        e: Hidden-width ratio relative to ``c2``.
-    """
-    def __init__(self, c1, c2, n=1, k=3, e=0.5):
-        super().__init__()
-        c_ = int(c2 * e)
-        self.cv1 = Conv(c1, c_, 1, 1)
-        # e=1.0 keeps the inner block at the full hidden width.
-        self.m = DS_C3k(c_, c_, n=n, k=k, e=1.0)
-        self.cv2 = Conv(c_, c2, 1, 1)
-    def forward(self, x):
-        x_ = self.cv1(x)
-        x_ = self.m(x_)
-        return self.cv2(x_)
-class AdaptiveHyperedgeGeneration(nn.Module):
-    """Produce a soft assignment of tokens to hyperedges from input-conditioned prototypes.
-
-    Args:
-        in_channels: Channel width ``C`` of the tokens.
-        num_hyperedges: Number of hyperedge prototypes.
-        num_heads: Heads used for the similarity; ``head_dim`` is ``in_channels // num_heads``.
-    """
-    def __init__(self, in_channels, num_hyperedges, num_heads=8):
-        super().__init__()
-        self.num_hyperedges = num_hyperedges
-        self.num_heads = num_heads
-        self.head_dim = in_channels // num_heads
-        # Learned prototypes shared across the batch.
-        self.global_proto = nn.Parameter(torch.randn(num_hyperedges, in_channels))
-        # Maps pooled context (avg + max) to a per-sample offset for every prototype.
-        self.context_mapper = nn.Linear(
-            2 * in_channels, num_hyperedges * in_channels, bias=False
-        )
-        self.query_proj = nn.Linear(in_channels, in_channels, bias=False)
-        self.scale = self.head_dim**-0.5
-    def forward(self, x):
-        """Return a ``(B, num_hyperedges, N)`` matrix whose rows are softmaxed over the ``N`` tokens.
-
-        ``x`` is unpacked as ``(B, N, C)``.
-        """
-        B, N, C = x.shape
-        # Global average and max over the token axis give a (B, 2C) context vector.
-        f_avg = F.adaptive_avg_pool1d(x.permute(0, 2, 1), 1).squeeze(-1)
-        f_max = F.adaptive_max_pool1d(x.permute(0, 2, 1), 1).squeeze(-1)
-        f_ctx = torch.cat((f_avg, f_max), dim=1)
-        delta_P = self.context_mapper(f_ctx).view(B, self.num_hyperedges, C)
-        P = self.global_proto.unsqueeze(0) + delta_P
-        z = self.query_proj(x)
-        # z: (B, heads, N, head_dim); P: (B, heads, head_dim, num_hyperedges).
-        z = z.view(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
-        P = P.view(B, self.num_hyperedges, self.num_heads, self.head_dim).permute(
-            0, 2, 3, 1
-        )
-        sim = (z @ P) * self.scale
-        # Average the similarities over heads, leaving (B, N, num_hyperedges).
-        s_bar = sim.mean(dim=1)
-        A = F.softmax(s_bar.permute(0, 2, 1), dim=-1)
-        return A
-class HypergraphConvolution(nn.Module):
-    """Aggregate tokens into hyperedges with ``A``, transform, scatter back, and add a residual.
-
-    Args:
-        in_channels: Channel width of the input tokens.
-        out_channels: Channel width after the scatter-back projection.
-    """
-    def __init__(self, in_channels, out_channels):
-        super().__init__()
-        self.W_e = nn.Linear(in_channels, in_channels, bias=False)
-        self.W_v = nn.Linear(in_channels, out_channels, bias=False)
-        self.act = nn.SiLU()
-    def forward(self, x, A):
-        # Tokens -> hyperedges.
-        f_m = torch.bmm(A, x)
-        f_m = self.act(self.W_e(f_m))
-        # Hyperedges -> tokens, using the transposed assignment.
-        x_out = torch.bmm(A.transpose(1, 2), f_m)
-        x_out = self.act(self.W_v(x_out))
-        # NOTE(review): the residual add needs x and x_out to be broadcast-compatible,
-        # i.e. presumably out_channels == in_channels -- confirm with callers.
-        return x + x_out
-class AdaptiveHypergraphComputation(nn.Module):
-    """Run hyperedge generation and hypergraph convolution on a 4-D feature map.
-
-    Args:
-        in_channels: Channels of the input map.
-        out_channels: Channels produced by the hypergraph convolution.
-        num_hyperedges: Number of hyperedges to generate.
-        num_heads: Heads used by the hyperedge generator.
-    """
-    def __init__(self, in_channels, out_channels, num_hyperedges=8, num_heads=8):
-        super().__init__()
-        self.adaptive_hyperedge_gen = AdaptiveHyperedgeGeneration(
-            in_channels, num_hyperedges, num_heads
-        )
-        self.hypergraph_conv = HypergraphConvolution(in_channels, out_channels)
-    def forward(self, x):
-        B, C, H, W = x.shape
-        # Flatten the two spatial axes into H*W tokens: (B, C, H, W) -> (B, H*W, C).
-        x_flat = x.flatten(2).permute(0, 2, 1)
-        A = self.adaptive_hyperedge_gen(x_flat)
-        x_out_flat = self.hypergraph_conv(x_flat, A)
-        # Restore the (B, channels, H, W) layout; the channel count is inferred.
-        x_out = x_out_flat.permute(0, 2, 1).view(B, -1, H, W)
-        return x_out
-class C3AH(nn.Module):
-    """Two-branch block: a 1x1 conv lateral path and a 1x1 conv + hypergraph path, fused by a 1x1 conv.
-
-    Args:
-        c1: Input channels.
-        c2: Output channels.
-        num_hyperedges: Forwarded to ``AdaptiveHypergraphComputation``.
-        num_heads: Forwarded to ``AdaptiveHypergraphComputation``.
-        e: Hidden-width ratio; each branch is ``int(c1 * e)`` channels wide.
-    """
-    def __init__(self, c1, c2, num_hyperedges=8, num_heads=8, e=0.5):
-        super().__init__()
-        c_ = int(c1 * e)
-        self.cv1 = Conv(c1, c_, 1, 1)
-        self.cv2 = Conv(c1, c_, 1, 1)
-        self.ahc = AdaptiveHypergraphComputation(c_, c_, num_hyperedges, num_heads)
-        self.cv3 = Conv(2 * c_, c2, 1, 1)
-    def forward(self, x):
-        x_lateral = self.cv1(x)
-        x_ahc = self.ahc(self.cv2(x))
-        # Hypergraph branch first, lateral branch second, along the channel axis.
-        return self.cv3(torch.cat((x_ahc, x_lateral), dim=1))
-class HyperACE(nn.Module):
-    """Fuse four feature maps and process the result in high-order, low-order and shortcut channel groups.
-
-    Args:
-        in_channels: Channel counts of the four input maps, unpacked as ``c2, c3, c4, c5``.
-        out_channels: Channels of the fused output.
-        num_hyperedges: Forwarded to each ``C3AH``.
-        num_heads: Forwarded to each ``C3AH``.
-        k: Number of parallel ``C3AH`` modules in the high-order branch.
-        l: Number of sequential ``DS_C3k`` modules in the low-order branch.
-        c_h: Fraction of the fused channels given to the high-order branch.
-        c_l: Fraction of the fused channels given to the low-order branch.
-    """
-    def __init__(
-        self,
-        in_channels: List[int],
-        out_channels: int,
-        num_hyperedges=8,
-        num_heads=8,
-        k=2,
-        l=1,
-        c_h=0.5,
-        c_l=0.25,
-    ):
-        super().__init__()
-        c2, c3, c4, c5 = in_channels
-        # The fused map uses the third input's channel count.
-        c_mid = c4
-        self.fuse_conv = Conv(c2 + c3 + c4 + c5, c_mid, 1, 1)
-        self.c_h = int(c_mid * c_h)
-        self.c_l = int(c_mid * c_l)
-        # Whatever remains after the two branches passes through unchanged.
-        self.c_s = c_mid - self.c_h - self.c_l
-        assert self.c_s > 0, "Channel split error"
-        self.high_order_branch = nn.ModuleList(
-            [
-                C3AH(self.c_h, self.c_h, num_hyperedges, num_heads, e=1.0)
-                for _ in range(k)
-            ]
-        )
-        self.high_order_fuse = Conv(self.c_h * k, self.c_h, 1, 1)
-        self.low_order_branch = nn.Sequential(
-            *[DS_C3k(self.c_l, self.c_l, n=1, k=3, e=1.0) for _ in range(l)]
-        )
-        self.final_fuse = Conv(self.c_h + self.c_l + self.c_s, out_channels, 1, 1)
-    def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
-        """Take four feature maps and return one map at the third map's spatial size."""
-        B2, B3, B4, B5 = x
-        B, _, H4, W4 = B4.shape
-        # Resize the other three maps to B4's spatial size before concatenating.
-        B2_resized = F.interpolate(
-            B2, size=(H4, W4), mode="bilinear", align_corners=False
-        )
-        B3_resized = F.interpolate(
-            B3, size=(H4, W4), mode="bilinear", align_corners=False
-        )
-        B5_resized = F.interpolate(
-            B5, size=(H4, W4), mode="bilinear", align_corners=False
-        )
-        x_b = self.fuse_conv(torch.cat((B2_resized, B3_resized, B4, B5_resized), dim=1))
-        x_h, x_l, x_s = torch.split(x_b, [self.c_h, self.c_l, self.c_s], dim=1)
-        # Every high-order module sees the same x_h; their outputs are concatenated and fused.
-        x_h_outs = [m(x_h) for m in self.high_order_branch]
-        x_h_fused = self.high_order_fuse(torch.cat(x_h_outs, dim=1))
-        x_l_out = self.low_order_branch(x_l)
-        y = self.final_fuse(torch.cat((x_h_fused, x_l_out, x_s), dim=1))
-        return y
-class GatedFusion(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.gamma = nn.Parameter(torch.zeros(1, in_channels, 1, 1))
-    def forward(self, f_in, h):
-        if f_in.shape[1] != h.shape[1]:
-            raise ValueError(f"Channel mismatch: f_in={f_in.shape}, h={h.shape}")
-        return f_in + self.gamma * h
-class Backbone(nn.Module):
-    """Five-stage convolutional encoder returning the feature maps of its last four stages.
-
-    Args:
-        in_channels: Channels of the input map.
-        base_channels: Channels produced by the stem.
-        base_depth: Bottleneck count for stages p2 and p5; p3 and p4 use twice as many.
-    """
-    def __init__(self, in_channels=256, base_channels=64, base_depth=3):
-        super().__init__()
-        # ``c`` is assigned but not used below.
-        c = base_channels
-        c2 = base_channels
-        # Stage widths after the stem are fixed and do not depend on base_channels.
-        c3 = 256
-        c4 = 384
-        c5 = 512
-        c6 = 768
-        # Every stage strides by (2, 1): the first spatial axis is downsampled, the second is kept.
-        self.stem = DSConv(in_channels, c2, k=3, s=(2, 1), p=1)
-        self.p2 = nn.Sequential(
-            DSConv(c2, c3, k=3, s=(2, 1), p=1), DS_C3k2(c3, c3, n=base_depth)
-        )
-        self.p3 = nn.Sequential(
-            DSConv(c3, c4, k=3, s=(2, 1), p=1), DS_C3k2(c4, c4, n=base_depth * 2)
-        )
-        self.p4 = nn.Sequential(
-            DSConv(c4, c5, k=3, s=(2, 1), p=1), DS_C3k2(c5, c5, n=base_depth * 2)
-        )
-        self.p5 = nn.Sequential(
-            DSConv(c5, c6, k=3, s=(2, 1), p=1), DS_C3k2(c6, c6, n=base_depth)
-        )
-        # Channel counts of the four maps returned by forward, in order.
-        self.out_channels = [c3, c4, c5, c6]
-    def forward(self, x):
-        """Return ``[x2, x3, x4, x5]``; the stem output itself is not returned."""
-        x = self.stem(x)
-        x2 = self.p2(x)
-        x3 = self.p3(x2)
-        x4 = self.p4(x3)
-        x5 = self.p5(x4)
-        return [x2, x3, x4, x5]
-class Decoder(nn.Module):
-    """Top-down decoder that merges encoder skips with a shared ``h_ace`` map at every level.
-
-    Args:
-        encoder_channels: Channels of the four encoder maps, unpacked as p2..p5.
-        hyperace_out_c: Channels of the ``h_ace`` map.
-        decoder_channels: Channels of the four decoder levels, unpacked as d2..d5.
-    """
-    def __init__(
-        self,
-        encoder_channels: List[int],
-        hyperace_out_c: int,
-        decoder_channels: List[int],
-    ):
-        super().__init__()
-        c_p2, c_p3, c_p4, c_p5 = encoder_channels
-        c_d2, c_d3, c_d4, c_d5 = decoder_channels
-        # 1x1 projections of h_ace to each decoder level's width.
-        self.h_to_d5 = Conv(hyperace_out_c, c_d5, 1, 1)
-        self.h_to_d4 = Conv(hyperace_out_c, c_d4, 1, 1)
-        self.h_to_d3 = Conv(hyperace_out_c, c_d3, 1, 1)
-        self.h_to_d2 = Conv(hyperace_out_c, c_d2, 1, 1)
-        self.fusion_d5 = GatedFusion(c_d5)
-        self.fusion_d4 = GatedFusion(c_d4)
-        self.fusion_d3 = GatedFusion(c_d3)
-        self.fusion_d2 = GatedFusion(c_d2)
-        # 1x1 projections of the encoder skips to each decoder level's width.
-        self.skip_p5 = Conv(c_p5, c_d5, 1, 1)
-        self.skip_p4 = Conv(c_p4, c_d4, 1, 1)
-        self.skip_p3 = Conv(c_p3, c_d3, 1, 1)
-        self.skip_p2 = Conv(c_p2, c_d2, 1, 1)
-        # Channel reduction applied after each spatial resize.
-        self.up_d5 = DS_C3k2(c_d5, c_d4, n=1)
-        self.up_d4 = DS_C3k2(c_d4, c_d3, n=1)
-        self.up_d3 = DS_C3k2(c_d3, c_d2, n=1)
-        self.final_d2 = DS_C3k2(c_d2, c_d2, n=1)
-    def forward(self, enc_feats: List[torch.Tensor], h_ace: torch.Tensor):
-        """Decode from p5 down to p2 and return the refined d2 map.
-
-        At each level ``h_ace`` is resized to that level's spatial size,
-        projected, and merged through the level's ``GatedFusion``.
-        """
-        p2, p3, p4, p5 = enc_feats
-        d5 = self.skip_p5(p5)
-        h_d5 = self.h_to_d5(F.interpolate(h_ace, size=d5.shape[2:], mode="bilinear"))
-        d5 = self.fusion_d5(d5, h_d5)
-        # Resize to the next level's spatial size, reduce channels, add that level's skip.
-        d5_up = F.interpolate(d5, size=p4.shape[2:], mode="bilinear")
-        d4_skip = self.skip_p4(p4)
-        d4 = self.up_d5(d5_up) + d4_skip
-        h_d4 = self.h_to_d4(F.interpolate(h_ace, size=d4.shape[2:], mode="bilinear"))
-        d4 = self.fusion_d4(d4, h_d4)
-        d4_up = F.interpolate(d4, size=p3.shape[2:], mode="bilinear")
-        d3_skip = self.skip_p3(p3)
-        d3 = self.up_d4(d4_up) + d3_skip
-        h_d3 = self.h_to_d3(F.interpolate(h_ace, size=d3.shape[2:], mode="bilinear"))
-        d3 = self.fusion_d3(d3, h_d3)
-        d3_up = F.interpolate(d3, size=p2.shape[2:], mode="bilinear")
-        d2_skip = self.skip_p2(p2)
-        d2 = self.up_d3(d3_up) + d2_skip
-        h_d2 = self.h_to_d2(F.interpolate(h_ace, size=d2.shape[2:], mode="bilinear"))
-        d2 = self.fusion_d2(d2, h_d2)
-        d2_final = self.final_d2(d2)
-        return d2_final
-class FreqPixelShuffle(nn.Module):
-    def __init__(self, in_channels, out_channels, scale=2):
-        super().__init__()
-        self.scale = scale
-        self.conv = DSConv(in_channels, out_channels * scale, k=3, s=1, p=1)
-        self.act = nn.SiLU()
-    def forward(self, x):
-        x = self.conv(x)
-        B, C_r, H, W = x.shape
-        out_c = C_r // self.scale
-        x = x.view(B, out_c, self.scale, H, W)
-        x = x.permute(0, 1, 3, 4, 2).contiguous()
-        x = x.view(B, out_c, H, W * self.scale)
-        return x
-class ProgressiveUpsampleHead(nn.Module):
-    def __init__(self, in_channels, out_channels, target_bins=1025):
-        super().__init__()
-        self.target_bins = target_bins
-        c = in_channels
-        self.block1 = FreqPixelShuffle(c, c, scale=2)
-        self.block2 = FreqPixelShuffle(c, c // 2, scale=2)
-        self.block3 = FreqPixelShuffle(c // 2, c // 2, scale=2)
-        self.block4 = FreqPixelShuffle(c // 2, c // 4, scale=2)
-        self.final_conv = nn.Conv2d(c // 4, out_channels, kernel_size=1, bias=False)
-    def forward(self, x):
-        x = self.block1(x)
-        x = self.block2(x)
-        x = self.block3(x)
-        x = self.block4(x)
-        if x.shape[-1] != self.target_bins:
-            x = F.interpolate(
-                x,
-                size=(x.shape[2], self.target_bins),
-                mode="bilinear",
-                align_corners=False,
-            )
-        x = self.final_conv(x)
-        return x
-class SegmModel(nn.Module):
-    def __init__(
-        self,
-        in_bands=62,
-        in_dim=256,
-        out_bins=1025,
-        out_channels=4,
-        base_channels=64,
-        base_depth=2,
-        num_hyperedges=16,
-        num_heads=8,
-    ):
-        super().__init__()
-        self.backbone = Backbone(
-            in_channels=in_dim, base_channels=base_channels, base_depth=base_depth
-        )
-        enc_channels = self.backbone.out_channels
-        c2, c3, c4, c5 = enc_channels
-        hyperace_in_channels = enc_channels
-        hyperace_out_channels = c4
-        self.hyperace = HyperACE(
-            hyperace_in_channels,
-            hyperace_out_channels,
-            num_hyperedges,
-            num_heads,
-            k=3,
-            l=2,
-        )
-        decoder_channels = [c2, c3, c4, c5]
-        self.decoder = Decoder(enc_channels, hyperace_out_channels, decoder_channels)
-        self.upsample_head = ProgressiveUpsampleHead(
-            in_channels=decoder_channels[0],
-            out_channels=out_channels,
-            target_bins=out_bins,
-        )
-    def forward(self, x):
-        H, W = x.shape[2:]
-        enc_feats = self.backbone(x)
-        h_ace_feats = self.hyperace(enc_feats)
-        dec_feat = self.decoder(enc_feats, h_ace_feats)
-        feat_time_restored = F.interpolate(
-            dec_feat, size=(H, dec_feat.shape[-1]), mode="bilinear", align_corners=False
-        )
-        out = self.upsample_head(feat_time_restored)
-        return out
-def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
-    dim_hidden = default(dim_hidden, dim_in)
-    net = []
-    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-        is_last = ind == (len(dims) - 2)
-        net.append(nn.Linear(layer_dim_in, layer_dim_out))
-        if is_last:
-            continue
-        net.append(activation())
-    return nn.Sequential(*net)
-class MaskEstimator(Module):
-    @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_freqs = ModuleList([])
-        dim_hidden = dim * mlp_expansion_factor
-        for dim_in in dim_inputs:
-            net = []
-            mlp = nn.Sequential(
-                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)
-            )
-            self.to_freqs.append(mlp)
-        self.segm = SegmModel(
-            in_bands=len(dim_inputs), in_dim=dim, out_bins=sum(dim_inputs) // 4
-        )
-    def forward(self, x):
-        y = rearrange(x, "b t f c -> b c t f")
-        y = self.segm(y)
-        y = rearrange(y, "b c t f -> b t (f c)")
-        x = x.unbind(dim=-2)
-        outs = []
-        for band_features, mlp in zip(x, self.to_freqs):
-            freq_out = mlp(band_features)
-            outs.append(freq_out)
-        return torch.cat(outs, dim=-1) + y
-DEFAULT_FREQS_PER_BANDS = (
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    128,
-    129,
-)
-class BSRoformerHyperACE(Module):
-    @beartype
-    def __init__(
-        self,
-        dim,
-        *,
-        depth,
-        stereo=False,
-        num_stems=1,
-        time_transformer_depth=2,
-        freq_transformer_depth=2,
-        linear_transformer_depth=0,
-        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        flash_attn=True,
-        dim_freqs_in=1025,
-        stft_n_fft=2048,
-        stft_hop_length=512,
-        stft_win_length=2048,
-        stft_normalized=False,
-        stft_window_fn: Optional[Callable] = None,
-        mask_estimator_depth=2,
-        multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
-            4096,
-            2048,
-            1024,
-            512,
-            256,
-        ),
-        multi_stft_hop_size=147,
-        multi_stft_normalized=False,
-        multi_stft_window_fn: Callable = torch.hann_window,
-        mlp_expansion_factor=4,
-        use_torch_checkpoint=False,
-        skip_connection=False,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.stereo = stereo
-        self.audio_channels = 2 if stereo else 1
-        self.num_stems = num_stems
-        self.use_torch_checkpoint = use_torch_checkpoint
-        self.skip_connection = skip_connection
-        self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
-        transformer_kwargs = dict(
-            dim=dim,
-            heads=heads,
-            dim_head=dim_head,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            flash_attn=flash_attn,
-            norm_output=False,
-            sage_attention=sage_attention,
-        )
-        time_rotary_embed = RotaryEmbedding(dim=dim_head)
-        freq_rotary_embed = RotaryEmbedding(dim=dim_head)
-        for _ in range(depth):
-            tran_modules = []
-            tran_modules.append(
-                Transformer(
-                    depth=time_transformer_depth,
-                    rotary_embed=time_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            tran_modules.append(
-                Transformer(
-                    depth=freq_transformer_depth,
-                    rotary_embed=freq_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            self.layers.append(nn.ModuleList(tran_modules))
-        self.final_norm = RMSNorm(dim)
-        self.stft_kwargs = dict(
-            n_fft=stft_n_fft,
-            hop_length=stft_hop_length,
-            win_length=stft_win_length,
-            normalized=stft_normalized,
-        )
-        self.stft_window_fn = partial(
-            default(stft_window_fn, torch.hann_window), stft_win_length
-        )
-        freqs = torch.stft(
-            torch.randn(1, 4096),
-            **self.stft_kwargs,
-            window=torch.ones(stft_win_length),
-            return_complex=True,
-        ).shape[1]
-        assert len(freqs_per_bands) > 1
-        assert (
-            sum(freqs_per_bands) == freqs
-        ), f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
-        freqs_per_bands_with_complex = tuple(
-            2 * f * self.audio_channels for f in freqs_per_bands
-        )
-        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
-        self.mask_estimators = nn.ModuleList([])
-        for _ in range(num_stems):
-            mask_estimator = MaskEstimator(
-                dim=dim,
-                dim_inputs=freqs_per_bands_with_complex,
-                depth=mask_estimator_depth,
-                mlp_expansion_factor=mlp_expansion_factor,
-            )
-            self.mask_estimators.append(mask_estimator)
-        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
-        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
-        self.multi_stft_n_fft = stft_n_fft
-        self.multi_stft_window_fn = multi_stft_window_fn
-        self.multi_stft_kwargs = dict(
-            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
-        )
-    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
-        device = raw_audio.device
-        x_is_mps = True if device.type == "mps" else False
-        if raw_audio.ndim == 2:
-            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
-        channels = raw_audio.shape[1]
-        assert (not self.stereo and channels == 1) or (
-            self.stereo and channels == 2
-        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
-        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
-        stft_window = self.stft_window_fn(device=device)
-        try:
-            stft_repr = torch.stft(
-                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
-            )
-        except:
-            stft_repr = torch.stft(
-                raw_audio.cpu() if x_is_mps else raw_audio,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=True,
-            ).to(device)
-        stft_repr = torch.view_as_real(stft_repr)
-        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
-        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
-        x = rearrange(stft_repr, "b f t c -> b t (f c)")
-        x = self.band_split(x)
-        for i, transformer_block in enumerate(self.layers):
-            time_transformer, freq_transformer = transformer_block
-            x = rearrange(x, "b t f d -> b f t d")
-            x, ps = pack([x], "* t d")
-            x = time_transformer(x)
-            (x,) = unpack(x, ps, "* t d")
-            x = rearrange(x, "b f t d -> b t f d")
-            x, ps = pack([x], "* f d")
-            x = freq_transformer(x)
-            (x,) = unpack(x, ps, "* f d")
-        x = self.final_norm(x)
-        num_stems = len(self.mask_estimators)
-        mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
-        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
-        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
-        stft_repr = torch.view_as_complex(stft_repr)
-        mask = torch.view_as_complex(mask)
-        stft_repr = stft_repr * mask
-        stft_repr = rearrange(
-            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
-        )
-        try:
-            recon_audio = torch.istft(
-                stft_repr,
-                **self.stft_kwargs,
-                window=stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            )
-        except:
-            recon_audio = torch.istft(
-                stft_repr.cpu() if x_is_mps else stft_repr,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            ).to(device)
-        recon_audio = rearrange(
-            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
-        )
-        if num_stems == 1:
-            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
-        if not exists(target):
-            return recon_audio
-        if self.num_stems > 1:
-            assert target.ndim == 4 and target.shape[1] == self.num_stems
-        if target.ndim == 2:
-            target = rearrange(target, "... t -> ... 1 t")
-        target = target[..., : recon_audio.shape[-1]]
-        loss = F.l1_loss(recon_audio, target)
-        multi_stft_resolution_loss = 0.0
-        for window_size in self.multi_stft_resolutions_window_sizes:
-            res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft),
-                win_length=window_size,
-                return_complex=True,
-                window=self.multi_stft_window_fn(window_size, device=device),
-                **self.multi_stft_kwargs,
-            )
-            recon_Y = torch.stft(
-                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            target_Y = torch.stft(
-                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
-                recon_Y, target_Y
-            )
-        weighted_multi_resolution_loss = (
-            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
-        )
-        total_loss = loss + weighted_multi_resolution_loss
-        if not return_loss_breakdown:
-            return total_loss
-        return total_loss, (loss, multi_stft_resolution_loss)

+from functools import partial
+import torch
+from torch import nn, einsum, Tensor
+from torch.nn import Module, ModuleList
+import torch.nn.functional as F
+from .attend import Attend
+from torch.utils.checkpoint import checkpoint
+from beartype.typing import Tuple, Optional, List, Callable
+from beartype import beartype
+from rotary_embedding_torch import RotaryEmbedding
+from einops import rearrange, pack, unpack
+from einops.layers.torch import Rearrange
+import torchaudio
+def exists(val):
+    """Return True unless ``val`` is ``None`` (falsy values such as 0 or "" still count as present)."""
+    is_missing = val is None
+    return not is_missing
+def default(v, d):
+    """Return ``v`` when it is not ``None``; otherwise return the fallback ``d``."""
+    if exists(v):
+        return v
+    return d
+def pack_one(t, pattern):
+    """Call einops ``pack`` on the one-element list ``[t]`` and return its result unchanged."""
+    singleton = [t]
+    return pack(singleton, pattern)
+def unpack_one(t, ps, pattern):
+    """Call einops ``unpack`` on ``t`` and return only the first unpacked element."""
+    pieces = unpack(t, ps, pattern)
+    return pieces[0]
+def l2norm(t):
+    """L2-normalize ``t`` along its last dimension using ``F.normalize``."""
+    last_dim = -1
+    return F.normalize(t, p=2, dim=last_dim)
+class RMSNorm(Module):
+    """Normalize the last dimension to unit L2 norm, then multiply by ``sqrt(dim)`` and a learned per-feature gain."""
+    def __init__(self, dim):
+        super().__init__()
+        # Constant factor applied after F.normalize in forward().
+        self.scale = dim**0.5
+        # Learnable gain, one value per feature, initialised to ones.
+        self.gamma = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        unit = F.normalize(x, dim=-1)
+        return unit * self.scale * self.gamma
+class FeedForward(Module):
+    """Pre-normalised MLP: RMSNorm -> Linear(dim, dim*mult) -> GELU -> Dropout -> Linear(dim*mult, dim) -> Dropout.
+    No residual connection is added here; the caller is responsible for it.
+    """
+    def __init__(self, dim, mult=4, dropout=0.0):
+        super().__init__()
+        hidden = int(dim * mult)
+        stages = [
+            RMSNorm(dim),
+            nn.Linear(dim, hidden),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden, dim),
+            nn.Dropout(dropout),
+        ]
+        self.net = nn.Sequential(*stages)
+    def forward(self, x):
+        out = self.net(x)
+        return out
+class Attention(Module):
+    """Multi-head self-attention with an RMSNorm pre-norm, optional rotary embeddings and per-head sigmoid output gates.
+    ``forward`` takes ``x`` laid out as ``(b, n, dim)`` (per the rearrange patterns) and returns a tensor projected back to ``dim``.
+    """
+    def __init__(
+        self,
+        dim,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        rotary_embed=None,
+        flash=True,
+    ):
+        super().__init__()
+        self.heads = heads
+        # Stored for reference; forward() in this class does not read it.
+        self.scale = dim_head**-0.5
+        inner = heads * dim_head
+        self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Linear(dim, inner * 3, bias=False)
+        # One gate logit per head and position, computed from the normalised input.
+        self.to_gates = nn.Linear(dim, heads)
+        self.to_out = nn.Sequential(
+            nn.Linear(inner, dim, bias=False),
+            nn.Dropout(dropout),
+        )
+    def forward(self, x):
+        x = self.norm(x)
+        qkv = self.to_qkv(x)
+        q, k, v = rearrange(qkv, "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads)
+        rotary = self.rotary_embed
+        if exists(rotary):
+            q = rotary.rotate_queries_or_keys(q)
+            k = rotary.rotate_queries_or_keys(k)
+        attended = self.attend(q, k, v)
+        # Broadcast each head's gate over the feature axis.
+        gate = rearrange(self.to_gates(x), "b n h -> b h n 1").sigmoid()
+        attended = attended * gate
+        merged = rearrange(attended, "b h n d -> b n (h d)")
+        return self.to_out(merged)
+class LinearAttention(Module):
+    """Attention variant that hands ``Attend`` tensors laid out as ``(b, h, d, n)`` (feature axis before sequence axis).
+    Queries and keys are L2-normalised along their last axis and the queries are scaled by ``exp(temperature)``, a learned per-head value.
+    """
+    @beartype
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head=32,
+        heads=8,
+        scale=8,
+        flash=True,
+        dropout=0.0,
+    ):
+        super().__init__()
+        inner = dim_head * heads
+        self.norm = RMSNorm(dim)
+        qkv_proj = nn.Linear(dim, inner * 3, bias=False)
+        qkv_split = Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads)
+        self.to_qkv = nn.Sequential(qkv_proj, qkv_split)
+        # Initialised to ones, so queries start scaled by e.
+        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
+        self.to_out = nn.Sequential(
+            Rearrange("b h d n -> b n (h d)"),
+            nn.Linear(inner, dim, bias=False),
+        )
+    def forward(self, x):
+        normed = self.norm(x)
+        q, k, v = self.to_qkv(normed)
+        q = l2norm(q)
+        k = l2norm(k)
+        q = q * self.temperature.exp()
+        attended = self.attend(q, k, v)
+        return self.to_out(attended)
+class Transformer(Module):
+    """Stack of ``depth`` residual (attention, feed-forward) pairs, optionally followed by a final RMSNorm.
+    ``linear_attn`` selects ``LinearAttention`` instead of ``Attention``; ``rotary_embed`` is only forwarded to ``Attention``.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        ff_mult=4,
+        norm_output=True,
+        rotary_embed=None,
+        flash_attn=True,
+        linear_attn=False,
+    ):
+        super().__init__()
+        self.layers = ModuleList([])
+        for _ in range(depth):
+            if not linear_attn:
+                attn_block = Attention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    rotary_embed=rotary_embed,
+                    flash=flash_attn,
+                )
+            else:
+                attn_block = LinearAttention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    flash=flash_attn,
+                )
+            ff_block = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
+            self.layers.append(ModuleList([attn_block, ff_block]))
+        if norm_output:
+            self.norm = RMSNorm(dim)
+        else:
+            self.norm = nn.Identity()
+    def forward(self, x):
+        for attn_block, ff_block in self.layers:
+            # Residual connection around each sub-layer.
+            x = attn_block(x) + x
+            x = ff_block(x) + x
+        return self.norm(x)
+class BandSplit(Module):
+    """Split the last axis into bands of sizes ``dim_inputs`` and project each band to ``dim`` with its own RMSNorm + Linear.
+    The per-band outputs are stacked on a new second-to-last axis.
+    """
+    @beartype
+    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_features = ModuleList([])
+        for band_width in dim_inputs:
+            projector = nn.Sequential(RMSNorm(band_width), nn.Linear(band_width, dim))
+            self.to_features.append(projector)
+    def forward(self, x):
+        bands = x.split(self.dim_inputs, dim=-1)
+        projected = [proj(band) for band, proj in zip(bands, self.to_features)]
+        return torch.stack(projected, dim=-2)
+class Conv(nn.Module):
+    """Bias-free ``Conv2d`` followed by an affine ``InstanceNorm2d`` and, when ``act`` is true, a SiLU activation."""
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
+        super().__init__()
+        padding = autopad(k, p)
+        self.conv = nn.Conv2d(c1, c2, k, s, padding, groups=g, bias=False)
+        # Attribute is named ``bn`` but the layer is an InstanceNorm2d.
+        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)
+        self.act = nn.SiLU() if act else nn.Identity()
+    def forward(self, x):
+        y = self.conv(x)
+        y = self.bn(y)
+        return self.act(y)
+def autopad(k, p=None):
+    """Return ``p`` unchanged when given; otherwise half the kernel size (element-wise, as a list, when ``k`` is not an int)."""
+    if p is not None:
+        return p
+    if isinstance(k, int):
+        return k // 2
+    return [side // 2 for side in k]
+class DSConv(nn.Module):
+    """Depthwise-separable convolution: depthwise ``k x k`` conv (``groups=c1``), pointwise 1x1 conv, affine InstanceNorm2d, optional SiLU."""
+    def __init__(self, c1, c2, k=3, s=1, p=None, act=True):
+        super().__init__()
+        padding = autopad(k, p)
+        self.dwconv = nn.Conv2d(c1, c1, k, s, padding, groups=c1, bias=False)
+        self.pwconv = nn.Conv2d(c1, c2, 1, 1, 0, bias=False)
+        # Attribute is named ``bn`` but the layer is an InstanceNorm2d.
+        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)
+        self.act = nn.SiLU() if act else nn.Identity()
+    def forward(self, x):
+        y = self.dwconv(x)
+        y = self.pwconv(y)
+        y = self.bn(y)
+        return self.act(y)
+class DS_Bottleneck(nn.Module):
+    """Two stacked ``DSConv`` layers with an identity shortcut that is active only when requested and ``c1 == c2``."""
+    def __init__(self, c1, c2, k=3, shortcut=True):
+        super().__init__()
+        # The first conv keeps the channel count and always uses a 3x3 kernel; only the second uses ``k``.
+        self.dsconv1 = DSConv(c1, c1, k=3, s=1)
+        self.dsconv2 = DSConv(c1, c2, k=k, s=1)
+        self.shortcut = shortcut and c1 == c2
+    def forward(self, x):
+        if self.shortcut:
+            return x + self.dsconv2(self.dsconv1(x))
+        return self.dsconv2(self.dsconv1(x))
+class DS_C3k(nn.Module):
+    """Two-branch block: ``cv1`` followed by ``n`` ``DS_Bottleneck`` layers, concatenated with a plain ``cv2`` branch and fused by ``cv3``.
+    Both branches use ``int(c2 * e)`` hidden channels.
+    """
+    def __init__(self, c1, c2, n=1, k=3, e=0.5):
+        super().__init__()
+        hidden = int(c2 * e)
+        self.cv1 = Conv(c1, hidden, 1, 1)
+        self.cv2 = Conv(c1, hidden, 1, 1)
+        self.cv3 = Conv(2 * hidden, c2, 1, 1)
+        bottlenecks = [DS_Bottleneck(hidden, hidden, k=k, shortcut=True) for _ in range(n)]
+        self.m = nn.Sequential(*bottlenecks)
+    def forward(self, x):
+        deep = self.m(self.cv1(x))
+        bypass = self.cv2(x)
+        joined = torch.cat((deep, bypass), dim=1)
+        return self.cv3(joined)
+class DS_C3k2(nn.Module):
+    """Sequential wrapper: 1x1 ``Conv`` to ``int(c2 * e)`` channels, a ``DS_C3k`` at that width, then a 1x1 ``Conv`` to ``c2``."""
+    def __init__(self, c1, c2, n=1, k=3, e=0.5):
+        super().__init__()
+        hidden = int(c2 * e)
+        self.cv1 = Conv(c1, hidden, 1, 1)
+        self.m = DS_C3k(hidden, hidden, n=n, k=k, e=1.0)
+        self.cv2 = Conv(hidden, c2, 1, 1)
+    def forward(self, x):
+        return self.cv2(self.m(self.cv1(x)))
+class AdaptiveHyperedgeGeneration(nn.Module):
+    """Produce a soft hyperedge-to-node participation matrix from node features.
+    Hyperedge prototypes are a learned global table plus an input-dependent offset computed from the
+    average- and max-pooled node features. Node queries are compared with the prototypes per head,
+    the similarities are averaged over heads, and a softmax over the node axis yields the result.
+    Args:
+        in_channels: feature size ``C`` of each node; must be divisible by ``num_heads``.
+        num_hyperedges: number of hyperedge prototypes ``E``.
+        num_heads: number of heads the channel axis is split into.
+    Raises:
+        ValueError: if ``num_heads`` is not positive or does not divide ``in_channels``.
+    """
+    def __init__(self, in_channels, num_hyperedges, num_heads=8):
+        super().__init__()
+        # forward() views the channel axis as (num_heads, head_dim); reject sizes that cannot be
+        # split evenly here instead of failing later with an opaque view() error.
+        if num_heads <= 0 or in_channels % num_heads != 0:
+            raise ValueError(
+                f"in_channels ({in_channels}) must be divisible by num_heads ({num_heads})"
+            )
+        self.num_hyperedges = num_hyperedges
+        self.num_heads = num_heads
+        self.head_dim = in_channels // num_heads
+        self.global_proto = nn.Parameter(torch.randn(num_hyperedges, in_channels))
+        self.context_mapper = nn.Linear(
+            2 * in_channels, num_hyperedges * in_channels, bias=False
+        )
+        self.query_proj = nn.Linear(in_channels, in_channels, bias=False)
+        self.scale = self.head_dim**-0.5
+    def forward(self, x):
+        """Map node features ``x`` of shape ``(B, N, C)`` to weights of shape ``(B, E, N)`` that sum to 1 over ``N``."""
+        B, N, C = x.shape
+        # Pool over the node axis to get one global context vector per batch element.
+        x_cn = x.permute(0, 2, 1)
+        f_avg = F.adaptive_avg_pool1d(x_cn, 1).squeeze(-1)
+        f_max = F.adaptive_max_pool1d(x_cn, 1).squeeze(-1)
+        f_ctx = torch.cat((f_avg, f_max), dim=1)
+        # Input-dependent offset added to the shared prototypes.
+        delta_P = self.context_mapper(f_ctx).view(B, self.num_hyperedges, C)
+        P = self.global_proto.unsqueeze(0) + delta_P
+        z = self.query_proj(x)
+        z = z.view(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
+        P = P.view(B, self.num_hyperedges, self.num_heads, self.head_dim).permute(
+            0, 2, 3, 1
+        )
+        # (B, heads, N, head_dim) @ (B, heads, head_dim, E) -> (B, heads, N, E)
+        sim = (z @ P) * self.scale
+        s_bar = sim.mean(dim=1)
+        A = F.softmax(s_bar.permute(0, 2, 1), dim=-1)
+        return A
+class HypergraphConvolution(nn.Module):
+    """Two-step message passing: gather node features into hyperedges with ``A``, then scatter back with ``A`` transposed.
+    The result is added to the input, so the residual sum requires the output to be shape-compatible with ``x``.
+    """
+    def __init__(self, in_channels, out_channels):
+        super().__init__()
+        self.W_e = nn.Linear(in_channels, in_channels, bias=False)
+        self.W_v = nn.Linear(in_channels, out_channels, bias=False)
+        self.act = nn.SiLU()
+    def forward(self, x, A):
+        # Nodes -> hyperedges.
+        edge_feats = self.act(self.W_e(torch.bmm(A, x)))
+        # Hyperedges -> nodes.
+        node_update = torch.bmm(A.transpose(1, 2), edge_feats)
+        node_update = self.act(self.W_v(node_update))
+        return x + node_update
+class AdaptiveHypergraphComputation(nn.Module):
+    """Apply hyperedge generation and hypergraph convolution to a 4-D feature map by treating every spatial position as a node."""
+    def __init__(self, in_channels, out_channels, num_hyperedges=8, num_heads=8):
+        super().__init__()
+        self.adaptive_hyperedge_gen = AdaptiveHyperedgeGeneration(
+            in_channels, num_hyperedges, num_heads
+        )
+        self.hypergraph_conv = HypergraphConvolution(in_channels, out_channels)
+    def forward(self, x):
+        batch, _, height, width = x.shape
+        # (B, C, H, W) -> (B, H*W, C): one node per spatial position.
+        nodes = x.flatten(2).permute(0, 2, 1)
+        incidence = self.adaptive_hyperedge_gen(nodes)
+        nodes_out = self.hypergraph_conv(nodes, incidence)
+        # Back to a channel-first map with the original spatial size.
+        return nodes_out.permute(0, 2, 1).view(batch, -1, height, width)
+class C3AH(nn.Module):
+    """Two-branch block: a lateral 1x1 ``Conv`` and a 1x1 ``Conv`` followed by ``AdaptiveHypergraphComputation``, concatenated and fused by ``cv3``.
+    Both branches use ``int(c1 * e)`` hidden channels.
+    """
+    def __init__(self, c1, c2, num_hyperedges=8, num_heads=8, e=0.5):
+        super().__init__()
+        hidden = int(c1 * e)
+        self.cv1 = Conv(c1, hidden, 1, 1)
+        self.cv2 = Conv(c1, hidden, 1, 1)
+        self.ahc = AdaptiveHypergraphComputation(hidden, hidden, num_hyperedges, num_heads)
+        self.cv3 = Conv(2 * hidden, c2, 1, 1)
+    def forward(self, x):
+        lateral = self.cv1(x)
+        hyper = self.ahc(self.cv2(x))
+        merged = torch.cat((hyper, lateral), dim=1)
+        return self.cv3(merged)
+class HyperACE(nn.Module):
+    """Fuse four feature maps at the resolution of the third one and refine the result in three channel groups.
+    After a 1x1 fusion to ``c4`` channels the map is split into a high-order group (``int(c4 * c_h)`` channels,
+    processed by ``k`` parallel ``C3AH`` blocks), a low-order group (``int(c4 * c_l)`` channels, processed by ``l``
+    stacked ``DS_C3k`` blocks) and a remainder that is passed through untouched. The groups are concatenated
+    and fused to ``out_channels``.
+    """
+    def __init__(
+        self,
+        in_channels: List[int],
+        out_channels: int,
+        num_hyperedges=8,
+        num_heads=8,
+        k=2,
+        l=1,
+        c_h=0.5,
+        c_l=0.25,
+    ):
+        super().__init__()
+        c2, c3, c4, c5 = in_channels
+        c_mid = c4
+        self.fuse_conv = Conv(c2 + c3 + c4 + c5, c_mid, 1, 1)
+        self.c_h = int(c_mid * c_h)
+        self.c_l = int(c_mid * c_l)
+        # Whatever is left after the two processed groups is the pass-through group.
+        self.c_s = c_mid - self.c_h - self.c_l
+        assert self.c_s > 0, "Channel split error"
+        hyper_blocks = [
+            C3AH(self.c_h, self.c_h, num_hyperedges, num_heads, e=1.0)
+            for _ in range(k)
+        ]
+        self.high_order_branch = nn.ModuleList(hyper_blocks)
+        self.high_order_fuse = Conv(self.c_h * k, self.c_h, 1, 1)
+        low_blocks = [DS_C3k(self.c_l, self.c_l, n=1, k=3, e=1.0) for _ in range(l)]
+        self.low_order_branch = nn.Sequential(*low_blocks)
+        self.final_fuse = Conv(self.c_h + self.c_l + self.c_s, out_channels, 1, 1)
+    def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
+        B2, B3, B4, B5 = x
+        _, _, H4, W4 = B4.shape
+        def to_b4_size(feat):
+            # Resize a map to B4's spatial size.
+            return F.interpolate(
+                feat, size=(H4, W4), mode="bilinear", align_corners=False
+            )
+        B2_resized = to_b4_size(B2)
+        B3_resized = to_b4_size(B3)
+        B5_resized = to_b4_size(B5)
+        stacked = torch.cat((B2_resized, B3_resized, B4, B5_resized), dim=1)
+        x_b = self.fuse_conv(stacked)
+        x_h, x_l, x_s = torch.split(x_b, [self.c_h, self.c_l, self.c_s], dim=1)
+        # Every C3AH block sees the same high-order input; their outputs are concatenated.
+        high_parallel = torch.cat([block(x_h) for block in self.high_order_branch], dim=1)
+        x_h_fused = self.high_order_fuse(high_parallel)
+        x_l_out = self.low_order_branch(x_l)
+        return self.final_fuse(torch.cat((x_h_fused, x_l_out, x_s), dim=1))
+class GatedFusion(nn.Module):
+    """Add ``h`` to ``f_in`` scaled by a learned per-channel gain that starts at zero, so the module is initially an identity on ``f_in``."""
+    def __init__(self, in_channels):
+        super().__init__()
+        # Shape (1, C, 1, 1) so it broadcasts over the batch and the two trailing axes.
+        self.gamma = nn.Parameter(torch.zeros(1, in_channels, 1, 1))
+    def forward(self, f_in, h):
+        channels_match = f_in.shape[1] == h.shape[1]
+        if not channels_match:
+            raise ValueError(f"Channel mismatch: f_in={f_in.shape}, h={h.shape}")
+        gated = self.gamma * h
+        return f_in + gated
+class Backbone(nn.Module):
+    """Depthwise-separable encoder with a stem and four stages, returning the four stage outputs.
+    The stem and every stage start with a ``DSConv`` of stride ``(2, 1)``, i.e. each one strides only
+    the first spatial axis (dim 2) and leaves the last axis at stride 1.
+    Args:
+        in_channels: channel count of the input map.
+        base_channels: width of the stem only; the stage widths are fixed at 256, 384, 512 and 768.
+        base_depth: bottleneck count ``n`` for the first and last stage; the two middle stages use twice that.
+    Attributes:
+        out_channels: ``[256, 384, 512, 768]``, the channel counts of the maps returned by ``forward``.
+    """
+    def __init__(self, in_channels=256, base_channels=64, base_depth=3):
+        super().__init__()
+        c2 = base_channels
+        c3 = 256
+        c4 = 384
+        c5 = 512
+        c6 = 768
+        self.stem = DSConv(in_channels, c2, k=3, s=(2, 1), p=1)
+        self.p2 = nn.Sequential(
+            DSConv(c2, c3, k=3, s=(2, 1), p=1), DS_C3k2(c3, c3, n=base_depth)
+        )
+        self.p3 = nn.Sequential(
+            DSConv(c3, c4, k=3, s=(2, 1), p=1), DS_C3k2(c4, c4, n=base_depth * 2)
+        )
+        self.p4 = nn.Sequential(
+            DSConv(c4, c5, k=3, s=(2, 1), p=1), DS_C3k2(c5, c5, n=base_depth * 2)
+        )
+        self.p5 = nn.Sequential(
+            DSConv(c5, c6, k=3, s=(2, 1), p=1), DS_C3k2(c6, c6, n=base_depth)
+        )
+        self.out_channels = [c3, c4, c5, c6]
+    def forward(self, x):
+        """Return ``[x2, x3, x4, x5]``, the outputs of stages ``p2`` to ``p5``; the stem output is not returned."""
+        x = self.stem(x)
+        x2 = self.p2(x)
+        x3 = self.p3(x2)
+        x4 = self.p4(x3)
+        x5 = self.p5(x4)
+        return [x2, x3, x4, x5]
+class Decoder(nn.Module):
+    """Top-down decoder that merges four encoder maps with a shared context map ``h_ace``.
+    Starting from the deepest encoder map, each level (d5 -> d4 -> d3 -> d2) projects the encoder
+    skip with a 1x1 ``Conv``, adds the resized and refined output of the previous level (d5 has none),
+    then injects ``h_ace`` (resized and projected to the level's width) through a ``GatedFusion``.
+    Args:
+        encoder_channels: channel counts of the four encoder maps, shallowest first.
+        hyperace_out_c: channel count of ``h_ace``.
+        decoder_channels: channel counts used at decoder levels d2..d5.
+    """
+    def __init__(
+        self,
+        encoder_channels: List[int],
+        hyperace_out_c: int,
+        decoder_channels: List[int],
+    ):
+        super().__init__()
+        c_p2, c_p3, c_p4, c_p5 = encoder_channels
+        c_d2, c_d3, c_d4, c_d5 = decoder_channels
+        self.h_to_d5 = Conv(hyperace_out_c, c_d5, 1, 1)
+        self.h_to_d4 = Conv(hyperace_out_c, c_d4, 1, 1)
+        self.h_to_d3 = Conv(hyperace_out_c, c_d3, 1, 1)
+        self.h_to_d2 = Conv(hyperace_out_c, c_d2, 1, 1)
+        self.fusion_d5 = GatedFusion(c_d5)
+        self.fusion_d4 = GatedFusion(c_d4)
+        self.fusion_d3 = GatedFusion(c_d3)
+        self.fusion_d2 = GatedFusion(c_d2)
+        self.skip_p5 = Conv(c_p5, c_d5, 1, 1)
+        self.skip_p4 = Conv(c_p4, c_d4, 1, 1)
+        self.skip_p3 = Conv(c_p3, c_d3, 1, 1)
+        self.skip_p2 = Conv(c_p2, c_d2, 1, 1)
+        self.up_d5 = DS_C3k2(c_d5, c_d4, n=1)
+        self.up_d4 = DS_C3k2(c_d4, c_d3, n=1)
+        self.up_d3 = DS_C3k2(c_d3, c_d2, n=1)
+        self.final_d2 = DS_C3k2(c_d2, c_d2, n=1)
+    @staticmethod
+    def _resize_like(src, ref):
+        """Bilinearly resize ``src`` to the spatial size (dims 2 and 3) of ``ref``."""
+        # align_corners is spelled out to match the other interpolate calls in this file;
+        # False is what bilinear interpolation uses when the argument is omitted.
+        return F.interpolate(
+            src, size=ref.shape[2:], mode="bilinear", align_corners=False
+        )
+    def forward(self, enc_feats: List[torch.Tensor], h_ace: torch.Tensor):
+        """Decode ``enc_feats`` (four maps, shallowest first) with context ``h_ace``; returns the refined d2 map."""
+        p2, p3, p4, p5 = enc_feats
+        # Level d5: deepest skip plus gated context.
+        d5 = self.skip_p5(p5)
+        h_d5 = self.h_to_d5(self._resize_like(h_ace, d5))
+        d5 = self.fusion_d5(d5, h_d5)
+        # Level d4.
+        d5_up = self._resize_like(d5, p4)
+        d4_skip = self.skip_p4(p4)
+        d4 = self.up_d5(d5_up) + d4_skip
+        h_d4 = self.h_to_d4(self._resize_like(h_ace, d4))
+        d4 = self.fusion_d4(d4, h_d4)
+        # Level d3.
+        d4_up = self._resize_like(d4, p3)
+        d3_skip = self.skip_p3(p3)
+        d3 = self.up_d4(d4_up) + d3_skip
+        h_d3 = self.h_to_d3(self._resize_like(h_ace, d3))
+        d3 = self.fusion_d3(d3, h_d3)
+        # Level d2.
+        d3_up = self._resize_like(d3, p2)
+        d2_skip = self.skip_p2(p2)
+        d2 = self.up_d3(d3_up) + d2_skip
+        h_d2 = self.h_to_d2(self._resize_like(h_ace, d2))
+        d2 = self.fusion_d2(d2, h_d2)
+        d2_final = self.final_d2(d2)
+        return d2_final
+class FreqPixelShuffle(nn.Module):
+    """Lengthen the last axis of a 4-D feature map by folding channels into it.
+
+    A `DSConv` first produces `out_channels * scale` channels.  `forward` then
+    moves every group of `scale` consecutive channels onto the last axis, so
+    the result has `out_channels` channels and a last axis `scale` times longer.
+    """
+
+    def __init__(self, in_channels, out_channels, scale=2):
+        super().__init__()
+        self.scale = scale
+        self.conv = DSConv(in_channels, out_channels * scale, k=3, s=1, p=1)
+        # Not called by forward(); kept so the module's attributes stay the same.
+        self.act = nn.SiLU()
+
+    def forward(self, x):
+        """Convolve `x` and return it reshaped to (B, out_c, H, W * scale)."""
+        expanded = self.conv(x)
+        batch, packed_channels, height, width = expanded.shape
+        factor = self.scale
+        channels = packed_channels // factor
+        # (B, out_c, scale, H, W) -> (B, out_c, H, W, scale): the `scale` slices
+        # become neighbours on the last axis before being merged into it.
+        grouped = expanded.view(batch, channels, factor, height, width)
+        interleaved = grouped.permute(0, 1, 3, 4, 2).contiguous()
+        return interleaved.view(batch, channels, height, width * factor)
+class ProgressiveUpsampleHead(nn.Module):
+    """Four chained `FreqPixelShuffle` stages followed by a 1x1 projection.
+
+    Each stage is built with `scale=2`, so the last axis grows 16x overall
+    while the channel count shrinks from `in_channels` to `in_channels // 4`.
+    If the result is not exactly `target_bins` long it is resized bilinearly
+    before the final convolution maps it to `out_channels`.
+    """
+
+    def __init__(self, in_channels, out_channels, target_bins=1025):
+        super().__init__()
+        self.target_bins = target_bins
+        full = in_channels
+        half = in_channels // 2
+        quarter = in_channels // 4
+        self.block1 = FreqPixelShuffle(full, full, scale=2)
+        self.block2 = FreqPixelShuffle(full, half, scale=2)
+        self.block3 = FreqPixelShuffle(half, half, scale=2)
+        self.block4 = FreqPixelShuffle(half, quarter, scale=2)
+        self.final_conv = nn.Conv2d(quarter, out_channels, kernel_size=1, bias=False)
+
+    def forward(self, x):
+        """Upsample the last axis of `x` to `target_bins` and project channels."""
+        for stage in (self.block1, self.block2, self.block3, self.block4):
+            x = stage(x)
+        if x.shape[-1] == self.target_bins:
+            return self.final_conv(x)
+        # Only the last axis is resized; axis 2 keeps its current length.
+        resized = F.interpolate(
+            x,
+            size=(x.shape[2], self.target_bins),
+            mode="bilinear",
+            align_corners=False,
+        )
+        return self.final_conv(resized)
+class SegmModel(nn.Module):
+    """Backbone -> HyperACE -> Decoder -> ProgressiveUpsampleHead pipeline.
+
+    Args:
+        in_bands: accepted for the caller's convenience; not read by this class.
+        in_dim: channel count of the input feature map (passed to `Backbone`).
+        out_bins: length of the last axis of the output.
+        out_channels: channel count of the output.
+        base_channels, base_depth: forwarded to `Backbone`.
+        num_hyperedges, num_heads: forwarded to `HyperACE`.
+    """
+
+    def __init__(
+        self,
+        in_bands=62,
+        in_dim=256,
+        out_bins=1025,
+        out_channels=4,
+        base_channels=64,
+        base_depth=2,
+        num_hyperedges=16,
+        num_heads=8,
+    ):
+        super().__init__()
+        self.backbone = Backbone(
+            in_channels=in_dim, base_channels=base_channels, base_depth=base_depth
+        )
+        stage_channels = self.backbone.out_channels
+        # The backbone must expose exactly four stage widths.
+        first, second, third, fourth = stage_channels
+        # HyperACE emits features at the width of the third stage.
+        ace_width = third
+        self.hyperace = HyperACE(
+            stage_channels,
+            ace_width,
+            num_hyperedges,
+            num_heads,
+            k=3,
+            l=2,
+        )
+        # The decoder mirrors the encoder widths one-to-one.
+        decoder_channels = [first, second, third, fourth]
+        self.decoder = Decoder(stage_channels, ace_width, decoder_channels)
+        self.upsample_head = ProgressiveUpsampleHead(
+            in_channels=first,
+            out_channels=out_channels,
+            target_bins=out_bins,
+        )
+
+    def forward(self, x):
+        """Map a 4-D feature map to (B, out_channels, H, out_bins), H = x.shape[2]."""
+        frames, _ = x.shape[2:]
+        pyramid = self.backbone(x)
+        context = self.hyperace(pyramid)
+        decoded = self.decoder(pyramid, context)
+        # Bring axis 2 back to its input length; the last axis is left as-is
+        # and is expanded afterwards by the upsample head.
+        restored = F.interpolate(
+            decoded,
+            size=(frames, decoded.shape[-1]),
+            mode="bilinear",
+            align_corners=False,
+        )
+        return self.upsample_head(restored)
+def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
+    """Build an `nn.Sequential` of `depth` linear layers.
+
+    Hidden layers are `dim_hidden` wide (falling back to `dim_in` when it is
+    None).  An `activation()` instance follows every linear layer except the
+    last one, so `depth=1` yields a single `nn.Linear(dim_in, dim_out)`.
+    """
+    dim_hidden = default(dim_hidden, dim_in)
+    widths = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
+    final_index = len(widths) - 2
+    layers = []
+    for index, (width_in, width_out) in enumerate(zip(widths[:-1], widths[1:])):
+        layers.append(nn.Linear(width_in, width_out))
+        if index != final_index:
+            layers.append(activation())
+    return nn.Sequential(*layers)
+class MaskEstimator(Module):
+    """Predict per-band mask values and add a `SegmModel` correction to them.
+
+    One gated MLP per band maps the `dim`-wide band feature to `dim_in`
+    outputs; the band outputs are concatenated along the last axis.  A
+    `SegmModel` fed with the same features produces a tensor that is flattened
+    to the same layout and summed with the concatenated MLP output.
+
+    Args:
+        dim: feature width of each band in the input.
+        dim_inputs: output width of every band, in band order.
+        depth: number of linear layers in each band MLP.
+        mlp_expansion_factor: hidden width of the MLPs as a multiple of `dim`.
+    """
+
+    @beartype
+    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_freqs = ModuleList([])
+        dim_hidden = dim * mlp_expansion_factor
+        for dim_in in dim_inputs:
+            # The MLP emits twice the band width because GLU halves the last
+            # axis (one half gates the other).
+            mlp = nn.Sequential(
+                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)
+            )
+            self.to_freqs.append(mlp)
+        # SegmModel's default out_channels is 4 and forward() flattens
+        # (bins, channels) together, hence the division by 4 here.
+        # NOTE(review): when sum(dim_inputs) is not a multiple of 4 (e.g. 2050)
+        # the flattened width is smaller than the MLP width and the addition
+        # in forward() cannot match shapes -- confirm callers never hit this.
+        self.segm = SegmModel(
+            in_bands=len(dim_inputs), in_dim=dim, out_bins=sum(dim_inputs) // 4
+        )
+
+    def forward(self, x):
+        """Return mask values for `x`, which is laid out as "b t f c"."""
+        y = rearrange(x, "b t f c -> b c t f")
+        y = self.segm(y)
+        y = rearrange(y, "b c t f -> b t (f c)")
+        # Split along the band axis and run each band through its own MLP.
+        band_outputs = [
+            mlp(band_features)
+            for band_features, mlp in zip(x.unbind(dim=-2), self.to_freqs)
+        ]
+        return torch.cat(band_outputs, dim=-1) + y
+# Default band layout: 62 band widths summing to 1025 frequency bins,
+# written as run lengths (24 bands of 2, 12 of 4, 8 each of 12/24/48, then
+# two wide bands of 128 and 129).
+DEFAULT_FREQS_PER_BANDS = (
+    *((2,) * 24),
+    *((4,) * 12),
+    *((12,) * 8),
+    *((24,) * 8),
+    *((48,) * 8),
+    128,
+    129,
+)
+class BSRoformerHyperACE(Module):
+    """Band-split transformer separator with `MaskEstimator` heads.
+
+    `forward` takes a waveform, computes its STFT, splits the spectrogram into
+    the bands given by `freqs_per_bands`, alternates transformers over the
+    time and frequency axes, predicts one complex mask per stem, and inverts
+    the masked STFT back to audio.  When a `target` is supplied it returns a
+    training loss (L1 on the waveform plus a multi-resolution STFT L1 term)
+    instead of audio.
+
+    Args:
+        dim: feature width used by the band split, transformers and estimators.
+        depth: number of (time transformer, frequency transformer) pairs.
+        stereo: whether the input has two audio channels (otherwise one).
+        num_stems: number of mask estimators, i.e. separated outputs.
+        time_transformer_depth, freq_transformer_depth: layers per transformer.
+        freqs_per_bands: band widths in STFT bins; must sum to the number of
+            frequency bins produced by the STFT settings.
+        dim_head, heads, attn_dropout, ff_dropout, flash_attn: forwarded to
+            every `Transformer`.
+        stft_n_fft, stft_hop_length, stft_win_length, stft_normalized:
+            settings for the analysis/synthesis STFT.
+        stft_window_fn: window factory; `torch.hann_window` when None.
+        mask_estimator_depth, mlp_expansion_factor: forwarded to
+            `MaskEstimator`.
+        multi_stft_*: settings for the multi-resolution STFT loss.
+        linear_transformer_depth, dim_freqs_in, **kwargs: accepted but not
+            used by this class.
+        use_torch_checkpoint, skip_connection: stored on the instance;
+            `forward` does not read them.
+    """
+
+    @beartype
+    def __init__(
+        self,
+        dim,
+        *,
+        depth,
+        stereo=False,
+        num_stems=1,
+        time_transformer_depth=2,
+        freq_transformer_depth=2,
+        linear_transformer_depth=0,
+        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        flash_attn=True,
+        dim_freqs_in=1025,
+        stft_n_fft=2048,
+        stft_hop_length=512,
+        stft_win_length=2048,
+        stft_normalized=False,
+        stft_window_fn: Optional[Callable] = None,
+        mask_estimator_depth=2,
+        multi_stft_resolution_loss_weight=1.0,
+        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
+            4096,
+            2048,
+            1024,
+            512,
+            256,
+        ),
+        multi_stft_hop_size=147,
+        multi_stft_normalized=False,
+        multi_stft_window_fn: Callable = torch.hann_window,
+        mlp_expansion_factor=4,
+        use_torch_checkpoint=False,
+        skip_connection=False,
+        **kwargs
+    ):
+        super().__init__()
+        self.stereo = stereo
+        self.audio_channels = 2 if stereo else 1
+        self.num_stems = num_stems
+        self.use_torch_checkpoint = use_torch_checkpoint
+        self.skip_connection = skip_connection
+        self.layers = ModuleList([])
+        transformer_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
+            norm_output=False,
+        )
+        # One rotary embedding per axis, shared by every layer on that axis.
+        time_rotary_embed = RotaryEmbedding(dim=dim_head)
+        freq_rotary_embed = RotaryEmbedding(dim=dim_head)
+        for _ in range(depth):
+            tran_modules = []
+            tran_modules.append(
+                Transformer(
+                    depth=time_transformer_depth,
+                    rotary_embed=time_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            tran_modules.append(
+                Transformer(
+                    depth=freq_transformer_depth,
+                    rotary_embed=freq_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            self.layers.append(nn.ModuleList(tran_modules))
+        self.final_norm = RMSNorm(dim)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized,
+        )
+        self.stft_window_fn = partial(
+            default(stft_window_fn, torch.hann_window), stft_win_length
+        )
+        # Run the STFT once on dummy audio to learn how many frequency bins
+        # the chosen settings produce.
+        freqs = torch.stft(
+            torch.randn(1, 4096),
+            **self.stft_kwargs,
+            window=torch.ones(stft_win_length),
+            return_complex=True,
+        ).shape[1]
+        assert len(freqs_per_bands) > 1
+        assert (
+            sum(freqs_per_bands) == freqs
+        ), f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
+        # Each bin contributes a real and an imaginary value per audio channel.
+        freqs_per_bands_with_complex = tuple(
+            2 * f * self.audio_channels for f in freqs_per_bands
+        )
+        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
+        self.mask_estimators = nn.ModuleList([])
+        for _ in range(num_stems):
+            mask_estimator = MaskEstimator(
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
+                depth=mask_estimator_depth,
+                mlp_expansion_factor=mlp_expansion_factor,
+            )
+            self.mask_estimators.append(mask_estimator)
+        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
+        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
+        self.multi_stft_n_fft = stft_n_fft
+        self.multi_stft_window_fn = multi_stft_window_fn
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
+        )
+
+    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
+        """Separate `raw_audio`, or compute the training loss against `target`.
+
+        Args:
+            raw_audio: waveform laid out as "b t" or "b s t" (s = channels).
+            target: optional reference audio; when given a loss is returned.
+            return_loss_breakdown: also return the two loss components.
+
+        Returns:
+            Without `target`: audio laid out "b s t" for a single stem, or
+            "b n s t" for several.  With `target`: the total loss, or
+            `(total_loss, (l1_loss, multi_stft_resolution_loss))` when
+            `return_loss_breakdown` is true.
+        """
+        device = raw_audio.device
+        x_is_mps = device.type == "mps"
+        if raw_audio.ndim == 2:
+            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
+        channels = raw_audio.shape[1]
+        assert (not self.stereo and channels == 1) or (
+            self.stereo and channels == 2
+        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
+        stft_window = self.stft_window_fn(device=device)
+        try:
+            stft_repr = torch.stft(
+                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
+            )
+        except Exception:
+            # Retry on the CPU when running on MPS.  On any other device the
+            # retry repeats the same call, so the original error resurfaces.
+            # `Exception` (not a bare except) lets KeyboardInterrupt and
+            # SystemExit propagate.
+            stft_repr = torch.stft(
+                raw_audio.cpu() if x_is_mps else raw_audio,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=True,
+            ).to(device)
+        stft_repr = torch.view_as_real(stft_repr)
+        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
+        # Interleave the audio channels into the frequency axis.
+        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
+        x = rearrange(stft_repr, "b f t c -> b t (f c)")
+        x = self.band_split(x)
+        for time_transformer, freq_transformer in self.layers:
+            # Attend along time: fold batch and bands into the leading axis.
+            x = rearrange(x, "b t f d -> b f t d")
+            x, ps = pack([x], "* t d")
+            x = time_transformer(x)
+            (x,) = unpack(x, ps, "* t d")
+            # Attend along bands: fold batch and time into the leading axis.
+            x = rearrange(x, "b f t d -> b t f d")
+            x, ps = pack([x], "* f d")
+            x = freq_transformer(x)
+            (x,) = unpack(x, ps, "* f d")
+        x = self.final_norm(x)
+        num_stems = len(self.mask_estimators)
+        mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
+        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
+        # Add a stem axis so the spectrogram broadcasts against every mask.
+        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
+        stft_repr = torch.view_as_complex(stft_repr)
+        mask = torch.view_as_complex(mask)
+        stft_repr = stft_repr * mask
+        stft_repr = rearrange(
+            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
+        )
+        try:
+            recon_audio = torch.istft(
+                stft_repr,
+                **self.stft_kwargs,
+                window=stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            )
+        except Exception:
+            # Same MPS -> CPU fallback as for the forward STFT above.
+            recon_audio = torch.istft(
+                stft_repr.cpu() if x_is_mps else stft_repr,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            ).to(device)
+        recon_audio = rearrange(
+            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
+        )
+        if num_stems == 1:
+            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
+        if not exists(target):
+            return recon_audio
+        if self.num_stems > 1:
+            assert target.ndim == 4 and target.shape[1] == self.num_stems
+        if target.ndim == 2:
+            target = rearrange(target, "... t -> ... 1 t")
+        # Trim the target to the reconstructed length before comparing.
+        target = target[..., : recon_audio.shape[-1]]
+        loss = F.l1_loss(recon_audio, target)
+        multi_stft_resolution_loss = 0.0
+        for window_size in self.multi_stft_resolutions_window_sizes:
+            res_stft_kwargs = dict(
+                n_fft=max(window_size, self.multi_stft_n_fft),
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
+            )
+            recon_Y = torch.stft(
+                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            target_Y = torch.stft(
+                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
+                recon_Y, target_Y
+            )
+        weighted_multi_resolution_loss = (
+            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        )
+        total_loss = loss + weighted_multi_resolution_loss
+        if not return_loss_breakdown:
+            return total_loss
+        return total_loss, (loss, multi_stft_resolution_loss)

models/bs_roformer/bs_roformer_hyperace2.py CHANGED Viewed

@@ -1,1166 +1,1147 @@
-from functools import partial
-import torch
-from torch import nn, einsum, Tensor
-from torch.nn import Module, ModuleList
-import torch.nn.functional as F
-from .attend import Attend
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
-from torch.utils.checkpoint import checkpoint
-from beartype.typing import Tuple, Optional, List, Callable
-from beartype import beartype
-from rotary_embedding_torch import RotaryEmbedding
-from einops import rearrange, pack, unpack
-from einops.layers.torch import Rearrange
-import torchaudio
-def exists(val):
-    return val is not None
-def default(v, d):
-    return v if exists(v) else d
-def pack_one(t, pattern):
-    return pack([t], pattern)
-def unpack_one(t, ps, pattern):
-    return unpack(t, ps, pattern)[0]
-def l2norm(t):
-    return F.normalize(t, dim=-1, p=2)
-class RMSNorm(Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.scale = dim**0.5
-        self.gamma = nn.Parameter(torch.ones(dim))
-    def forward(self, x):
-        return F.normalize(x, dim=-1) * self.scale * self.gamma
-class FeedForward(Module):
-    def __init__(self, dim, mult=4, dropout=0.0):
-        super().__init__()
-        dim_inner = int(dim * mult)
-        self.net = nn.Sequential(
-            RMSNorm(dim),
-            nn.Linear(dim, dim_inner),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_inner, dim),
-            nn.Dropout(dropout),
-        )
-    def forward(self, x):
-        return self.net(x)
-class Attention(Module):
-    def __init__(
-        self,
-        dim,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        rotary_embed=None,
-        flash=True,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.heads = heads
-        self.scale = dim_head**-0.5
-        dim_inner = heads * dim_head
-        self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
-        self.to_gates = nn.Linear(dim, heads)
-        self.to_out = nn.Sequential(
-            nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)
-        )
-    def forward(self, x):
-        x = self.norm(x)
-        q, k, v = rearrange(
-            self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
-        )
-        if exists(self.rotary_embed):
-            q = self.rotary_embed.rotate_queries_or_keys(q)
-            k = self.rotary_embed.rotate_queries_or_keys(k)
-        out = self.attend(q, k, v)
-        gates = self.to_gates(x)
-        out = out * rearrange(gates, "b n h -> b h n 1").sigmoid()
-        out = rearrange(out, "b h n d -> b n (h d)")
-        return self.to_out(out)
-class LinearAttention(Module):
-    @beartype
-    def __init__(
-        self,
-        *,
-        dim,
-        dim_head=32,
-        heads=8,
-        scale=8,
-        flash=True,
-        dropout=0.0,
-        sage_attention=False,
-    ):
-        super().__init__()
-        dim_inner = dim_head * heads
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Sequential(
-            nn.Linear(dim, dim_inner * 3, bias=False),
-            Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads),
-        )
-        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(scale=scale, dropout=dropout, flash=flash)
-        else:
-            self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
-        self.to_out = nn.Sequential(
-            Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)
-        )
-    def forward(self, x):
-        x = self.norm(x)
-        q, k, v = self.to_qkv(x)
-        q, k = map(l2norm, (q, k))
-        q = q * self.temperature.exp()
-        out = self.attend(q, k, v)
-        return self.to_out(out)
-class Transformer(Module):
-    def __init__(
-        self,
-        *,
-        dim,
-        depth,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        ff_mult=4,
-        norm_output=True,
-        rotary_embed=None,
-        flash_attn=True,
-        linear_attn=False,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.layers = ModuleList([])
-        for _ in range(depth):
-            if linear_attn:
-                attn = LinearAttention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            else:
-                attn = Attention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    rotary_embed=rotary_embed,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            self.layers.append(
-                ModuleList(
-                    [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
-                )
-            )
-        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
-    def forward(self, x):
-        for attn, ff in self.layers:
-            x = attn(x) + x
-            x = ff(x) + x
-        return self.norm(x)
-class BandSplit(Module):
-    @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...]):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_features = ModuleList([])
-        for dim_in in dim_inputs:
-            net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))
-            self.to_features.append(net)
-    def forward(self, x):
-        x = x.split(self.dim_inputs, dim=-1)
-        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
-            split_output = to_feature(split_input)
-            outs.append(split_output)
-        x = torch.stack(outs, dim=-2)
-        return x
-class Conv(nn.Module):
-    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
-        super().__init__()
-        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
-        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)
-        self.act = nn.SiLU() if act else nn.Identity()
-    def forward(self, x):
-        return self.act(self.bn(self.conv(x)))
-def autopad(k, p=None):
-    if p is None:
-        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
-    return p
-class DSConv(nn.Module):
-    def __init__(self, c1, c2, k=3, s=1, p=None, act=True):
-        super().__init__()
-        self.dwconv = nn.Conv2d(c1, c1, k, s, autopad(k, p), groups=c1, bias=False)
-        self.pwconv = nn.Conv2d(c1, c2, 1, 1, 0, bias=False)
-        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)
-        self.act = nn.SiLU() if act else nn.Identity()
-    def forward(self, x):
-        return self.act(self.bn(self.pwconv(self.dwconv(x))))
-class DS_Bottleneck(nn.Module):
-    def __init__(self, c1, c2, k=3, shortcut=True):
-        super().__init__()
-        c_ = c1
-        self.dsconv1 = DSConv(c1, c_, k=3, s=1)
-        self.dsconv2 = DSConv(c_, c2, k=k, s=1)
-        self.shortcut = shortcut and c1 == c2
-    def forward(self, x):
-        return (
-            x + self.dsconv2(self.dsconv1(x))
-            if self.shortcut
-            else self.dsconv2(self.dsconv1(x))
-        )
-class DS_C3k(nn.Module):
-    def __init__(self, c1, c2, n=1, k=3, e=0.5):
-        super().__init__()
-        c_ = int(c2 * e)
-        self.cv1 = Conv(c1, c_, 1, 1)
-        self.cv2 = Conv(c1, c_, 1, 1)
-        self.cv3 = Conv(2 * c_, c2, 1, 1)
-        self.m = nn.Sequential(
-            *[DS_Bottleneck(c_, c_, k=k, shortcut=True) for _ in range(n)]
-        )
-    def forward(self, x):
-        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
-class DS_C3k2(nn.Module):
-    def __init__(self, c1, c2, n=1, k=3, e=0.5):
-        super().__init__()
-        c_ = int(c2 * e)
-        self.cv1 = Conv(c1, c_, 1, 1)
-        self.m = DS_C3k(c_, c_, n=n, k=k, e=1.0)
-        self.cv2 = Conv(c_, c2, 1, 1)
-    def forward(self, x):
-        x_ = self.cv1(x)
-        x_ = self.m(x_)
-        return self.cv2(x_)
-class AdaptiveHyperedgeGeneration(nn.Module):
-    def __init__(self, in_channels, num_hyperedges, num_heads=8):
-        super().__init__()
-        self.num_hyperedges = num_hyperedges
-        self.num_heads = num_heads
-        self.head_dim = in_channels // num_heads
-        self.global_proto = nn.Parameter(torch.randn(num_hyperedges, in_channels))
-        self.context_mapper = nn.Linear(
-            2 * in_channels, num_hyperedges * in_channels, bias=False
-        )
-        self.query_proj = nn.Linear(in_channels, in_channels, bias=False)
-        self.scale = self.head_dim**-0.5
-    def forward(self, x):
-        B, N, C = x.shape
-        f_avg = F.adaptive_avg_pool1d(x.permute(0, 2, 1), 1).squeeze(-1)
-        f_max = F.adaptive_max_pool1d(x.permute(0, 2, 1), 1).squeeze(-1)
-        f_ctx = torch.cat((f_avg, f_max), dim=1)
-        delta_P = self.context_mapper(f_ctx).view(B, self.num_hyperedges, C)
-        P = self.global_proto.unsqueeze(0) + delta_P
-        z = self.query_proj(x)
-        z = z.view(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)
-        P = P.view(B, self.num_hyperedges, self.num_heads, self.head_dim).permute(
-            0, 2, 3, 1
-        )
-        sim = (z @ P) * self.scale
-        s_bar = sim.mean(dim=1)
-        A = F.softmax(s_bar.permute(0, 2, 1), dim=-1)
-        return A
-class HypergraphConvolution(nn.Module):
-    def __init__(self, in_channels, out_channels):
-        super().__init__()
-        self.W_e = nn.Linear(in_channels, in_channels, bias=False)
-        self.W_v = nn.Linear(in_channels, out_channels, bias=False)
-        self.act = nn.SiLU()
-    def forward(self, x, A):
-        f_m = torch.bmm(A, x)
-        f_m = self.act(self.W_e(f_m))
-        x_out = torch.bmm(A.transpose(1, 2), f_m)
-        x_out = self.act(self.W_v(x_out))
-        return x + x_out
-class AdaptiveHypergraphComputation(nn.Module):
-    def __init__(self, in_channels, out_channels, num_hyperedges=8, num_heads=8):
-        super().__init__()
-        self.adaptive_hyperedge_gen = AdaptiveHyperedgeGeneration(
-            in_channels, num_hyperedges, num_heads
-        )
-        self.hypergraph_conv = HypergraphConvolution(in_channels, out_channels)
-    def forward(self, x):
-        B, C, H, W = x.shape
-        x_flat = x.flatten(2).permute(0, 2, 1)
-        A = self.adaptive_hyperedge_gen(x_flat)
-        x_out_flat = self.hypergraph_conv(x_flat, A)
-        x_out = x_out_flat.permute(0, 2, 1).view(B, -1, H, W)
-        return x_out
-class C3AH(nn.Module):
-    def __init__(self, c1, c2, num_hyperedges=8, num_heads=8, e=0.5):
-        super().__init__()
-        c_ = int(c1 * e)
-        self.cv1 = Conv(c1, c_, 1, 1)
-        self.cv2 = Conv(c1, c_, 1, 1)
-        self.ahc = AdaptiveHypergraphComputation(c_, c_, num_hyperedges, num_heads)
-        self.cv3 = Conv(2 * c_, c2, 1, 1)
-    def forward(self, x):
-        x_lateral = self.cv1(x)
-        x_ahc = self.ahc(self.cv2(x))
-        return self.cv3(torch.cat((x_ahc, x_lateral), dim=1))
-class HyperACE(nn.Module):
-    def __init__(
-        self,
-        in_channels: List[int],
-        out_channels: int,
-        num_hyperedges=8,
-        num_heads=8,
-        k=2,
-        l=1,
-        c_h=0.5,
-        c_l=0.25,
-    ):
-        super().__init__()
-        c2, c3, c4, c5 = in_channels
-        c_mid = c4
-        self.fuse_conv = Conv(c2 + c3 + c4 + c5, c_mid, 1, 1)
-        self.c_h = int(c_mid * c_h)
-        self.c_l = int(c_mid * c_l)
-        self.c_s = c_mid - self.c_h - self.c_l
-        assert self.c_s > 0, "Channel split error"
-        self.high_order_branch = nn.ModuleList(
-            [
-                C3AH(self.c_h, self.c_h, num_hyperedges, num_heads, e=1.0)
-                for _ in range(k)
-            ]
-        )
-        self.high_order_fuse = Conv(self.c_h * k, self.c_h, 1, 1)
-        self.low_order_branch = nn.Sequential(
-            *[DS_C3k(self.c_l, self.c_l, n=1, k=3, e=1.0) for _ in range(l)]
-        )
-        self.final_fuse = Conv(self.c_h + self.c_l + self.c_s, out_channels, 1, 1)
-    def forward(self, x: List[torch.Tensor]) -> torch.Tensor:
-        B2, B3, B4, B5 = x
-        B, _, H4, W4 = B4.shape
-        B2_resized = F.interpolate(
-            B2, size=(H4, W4), mode="bilinear", align_corners=False
-        )
-        B3_resized = F.interpolate(
-            B3, size=(H4, W4), mode="bilinear", align_corners=False
-        )
-        B5_resized = F.interpolate(
-            B5, size=(H4, W4), mode="bilinear", align_corners=False
-        )
-        x_b = self.fuse_conv(torch.cat((B2_resized, B3_resized, B4, B5_resized), dim=1))
-        x_h, x_l, x_s = torch.split(x_b, [self.c_h, self.c_l, self.c_s], dim=1)
-        x_h_outs = [m(x_h) for m in self.high_order_branch]
-        x_h_fused = self.high_order_fuse(torch.cat(x_h_outs, dim=1))
-        x_l_out = self.low_order_branch(x_l)
-        y = self.final_fuse(torch.cat((x_h_fused, x_l_out, x_s), dim=1))
-        return y
-class GatedFusion(nn.Module):
-    def __init__(self, in_channels):
-        super().__init__()
-        self.gamma = nn.Parameter(torch.zeros(1, in_channels, 1, 1))
-    def forward(self, f_in, h):
-        if f_in.shape[1] != h.shape[1]:
-            raise ValueError(f"Channel mismatch: f_in={f_in.shape}, h={h.shape}")
-        return f_in + self.gamma * h
-class Backbone(nn.Module):
-    def __init__(self, in_channels=256, base_channels=64, base_depth=3):
-        super().__init__()
-        c = base_channels
-        c2 = base_channels
-        c3 = 256
-        c4 = 384
-        c5 = 512
-        c6 = 768
-        self.stem = DSConv(in_channels, c2, k=3, s=(2, 1), p=1)
-        self.p2 = nn.Sequential(
-            DSConv(c2, c3, k=3, s=(2, 1), p=1), DS_C3k2(c3, c3, n=base_depth)
-        )
-        self.p3 = nn.Sequential(
-            DSConv(c3, c4, k=3, s=(2, 1), p=1), DS_C3k2(c4, c4, n=base_depth * 2)
-        )
-        self.p4 = nn.Sequential(
-            DSConv(c4, c5, k=3, s=2, p=1), DS_C3k2(c5, c5, n=base_depth * 2)
-        )
-        self.p5 = nn.Sequential(
-            DSConv(c5, c6, k=3, s=2, p=1), DS_C3k2(c6, c6, n=base_depth)
-        )
-        self.out_channels = [c3, c4, c5, c6]
-    def forward(self, x):
-        x = self.stem(x)
-        x2 = self.p2(x)
-        x3 = self.p3(x2)
-        x4 = self.p4(x3)
-        x5 = self.p5(x4)
-        return [x2, x3, x4, x5]
-class Decoder(nn.Module):
-    def __init__(
-        self,
-        encoder_channels: List[int],
-        hyperace_out_c: int,
-        decoder_channels: List[int],
-    ):
-        super().__init__()
-        c_p2, c_p3, c_p4, c_p5 = encoder_channels
-        c_d2, c_d3, c_d4, c_d5 = decoder_channels
-        self.h_to_d5 = Conv(hyperace_out_c, c_d5, 1, 1)
-        self.h_to_d4 = Conv(hyperace_out_c, c_d4, 1, 1)
-        self.h_to_d3 = Conv(hyperace_out_c, c_d3, 1, 1)
-        self.h_to_d2 = Conv(hyperace_out_c, c_d2, 1, 1)
-        self.fusion_d5 = GatedFusion(c_d5)
-        self.fusion_d4 = GatedFusion(c_d4)
-        self.fusion_d3 = GatedFusion(c_d3)
-        self.fusion_d2 = GatedFusion(c_d2)
-        self.skip_p5 = Conv(c_p5, c_d5, 1, 1)
-        self.skip_p4 = Conv(c_p4, c_d4, 1, 1)
-        self.skip_p3 = Conv(c_p3, c_d3, 1, 1)
-        self.skip_p2 = Conv(c_p2, c_d2, 1, 1)
-        self.up_d5 = DS_C3k2(c_d5, c_d4, n=1)
-        self.up_d4 = DS_C3k2(c_d4, c_d3, n=1)
-        self.up_d3 = DS_C3k2(c_d3, c_d2, n=1)
-        self.final_d2 = DS_C3k2(c_d2, c_d2, n=1)
-    def forward(self, enc_feats: List[torch.Tensor], h_ace: torch.Tensor):
-        p2, p3, p4, p5 = enc_feats
-        d5 = self.skip_p5(p5)
-        h_d5 = self.h_to_d5(F.interpolate(h_ace, size=d5.shape[2:], mode="bilinear"))
-        d5 = self.fusion_d5(d5, h_d5)
-        d5_up = F.interpolate(d5, size=p4.shape[2:], mode="bilinear")
-        d4_skip = self.skip_p4(p4)
-        d4 = self.up_d5(d5_up) + d4_skip
-        h_d4 = self.h_to_d4(F.interpolate(h_ace, size=d4.shape[2:], mode="bilinear"))
-        d4 = self.fusion_d4(d4, h_d4)
-        d4_up = F.interpolate(d4, size=p3.shape[2:], mode="bilinear")
-        d3_skip = self.skip_p3(p3)
-        d3 = self.up_d4(d4_up) + d3_skip
-        h_d3 = self.h_to_d3(F.interpolate(h_ace, size=d3.shape[2:], mode="bilinear"))
-        d3 = self.fusion_d3(d3, h_d3)
-        d3_up = F.interpolate(d3, size=p2.shape[2:], mode="bilinear")
-        d2_skip = self.skip_p2(p2)
-        d2 = self.up_d3(d3_up) + d2_skip
-        h_d2 = self.h_to_d2(F.interpolate(h_ace, size=d2.shape[2:], mode="bilinear"))
-        d2 = self.fusion_d2(d2, h_d2)
-        d2_final = self.final_d2(d2)
-        return d2_final
-class TFC_TDF(nn.Module):  # Stack of `l` residual blocks: tfc1 conv -> x + tdf(x) -> tfc2 conv, plus a 1x1-conv shortcut.
-    def __init__(self, in_c, c, l, f, bn=4):  # in_c/c: input/output channels; l: number of blocks; f: size of the last tensor axis; bn: divisor for the tdf bottleneck
-        super().__init__()
-        self.blocks = nn.ModuleList()
-        for i in range(l):
-            block = nn.Module()  # bare Module used only as a container for the named sub-layers
-            block.tfc1 = nn.Sequential(
-                nn.InstanceNorm2d(in_c, affine=True, eps=1e-8),
-                nn.SiLU(),
-                nn.Conv2d(in_c, c, 3, 1, 1, bias=False),
-            )
-            block.tdf = nn.Sequential(
-                nn.InstanceNorm2d(c, affine=True, eps=1e-8),
-                nn.SiLU(),
-                nn.Linear(f, f // bn, bias=False),  # Linear acts on the last axis: f -> f // bn
-                nn.InstanceNorm2d(c, affine=True, eps=1e-8),
-                nn.SiLU(),
-                nn.Linear(f // bn, f, bias=False),  # and back: f // bn -> f
-            )
-            block.tfc2 = nn.Sequential(
-                nn.InstanceNorm2d(c, affine=True, eps=1e-8),
-                nn.SiLU(),
-                nn.Conv2d(c, c, 3, 1, 1, bias=False),
-            )
-            block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False)
-            self.blocks.append(block)
-            in_c = c  # every block after the first maps c -> c
-    def forward(self, x):  # applies the blocks in order and returns the final tensor
-        for block in self.blocks:
-            s = block.shortcut(x)  # shortcut is computed from the block input, before tfc1
-            x = block.tfc1(x)
-            x = x + block.tdf(x)  # inner residual around the last-axis bottleneck
-            x = block.tfc2(x)
-            x = x + s  # outer residual through the 1x1 shortcut
-        return x
-class FreqPixelShuffle(nn.Module):  # Widens the last axis by `scale` by moving channel groups into it, then refines with TFC_TDF.
-    def __init__(self, in_channels, out_channels, scale, f):  # f is passed to TFC_TDF as the last-axis size after widening
-        super().__init__()
-        self.scale = scale
-        self.conv = DSConv(in_channels, out_channels * scale)  # produce `scale` channel groups per output channel
-        self.out_conv = TFC_TDF(out_channels, out_channels, 2, f)
-    def forward(self, x):  # (B, C, H, W) -> (B, out_channels, H, W * scale)
-        x = self.conv(x)
-        B, C_r, H, W = x.shape
-        out_c = C_r // self.scale
-        x = x.view(B, out_c, self.scale, H, W)  # split channels into (out_c, scale)
-        x = x.permute(0, 1, 3, 4, 2).contiguous()  # move the scale factor next to W
-        x = x.view(B, out_c, H, W * self.scale)  # interleave: each W position expands into `scale` entries
-        return self.out_conv(x)
-class ProgressiveUpsampleHead(nn.Module):  # Four x2 FreqPixelShuffle stages (last axis x16, channels /16), then resize to target_bins and a 3x3 conv.
-    def __init__(self, in_channels, out_channels, target_bins=1025, in_bands=62):  # in_bands: expected last-axis size of the input; target_bins: last-axis size of the output
-        super().__init__()
-        self.target_bins = target_bins
-        c = in_channels
-        self.block1 = FreqPixelShuffle(c, c // 2, scale=2, f=in_bands * 2)  # f tracks the last-axis size after each doubling
-        self.block2 = FreqPixelShuffle(c // 2, c // 4, scale=2, f=in_bands * 4)
-        self.block3 = FreqPixelShuffle(c // 4, c // 8, scale=2, f=in_bands * 8)
-        self.block4 = FreqPixelShuffle(c // 8, c // 16, scale=2, f=in_bands * 16)
-        self.final_conv = nn.Conv2d(
-            c // 16, out_channels, kernel_size=3, stride=1, padding="same", bias=False
-        )
-    def forward(self, x):  # returns a tensor with out_channels channels and last axis == target_bins
-        x = self.block1(x)
-        x = self.block2(x)
-        x = self.block3(x)
-        x = self.block4(x)
-        if x.shape[-1] != self.target_bins:  # 16 * in_bands generally differs from target_bins, so resize only the last axis
-            x = F.interpolate(
-                x,
-                size=(x.shape[2], self.target_bins),
-                mode="bilinear",
-                align_corners=False,
-            )
-        x = self.final_conv(x)
-        return x
-class SegmModel(nn.Module):  # Backbone -> HyperACE -> Decoder -> ProgressiveUpsampleHead pipeline over a 4-D (B, C, H, W) input.
-    def __init__(
-        self,
-        in_bands=62,  # forwarded to ProgressiveUpsampleHead as the input last-axis size
-        in_dim=256,  # input channel count given to the Backbone
-        out_bins=1025,  # last-axis size of the output
-        out_channels=4,
-        base_channels=64,
-        base_depth=2,
-        num_hyperedges=32,
-        num_heads=8,
-    ):
-        super().__init__()
-        self.backbone = Backbone(
-            in_channels=in_dim, base_channels=base_channels, base_depth=base_depth
-        )
-        enc_channels = self.backbone.out_channels  # channel counts of the four encoder feature maps
-        c2, c3, c4, c5 = enc_channels
-        hyperace_in_channels = enc_channels
-        hyperace_out_channels = c4
-        self.hyperace = HyperACE(
-            hyperace_in_channels,
-            hyperace_out_channels,
-            num_hyperedges,
-            num_heads,
-            k=2,
-            l=1,
-        )
-        decoder_channels = [c2, c3, c4, c5]  # decoder mirrors the encoder widths
-        self.decoder = Decoder(enc_channels, hyperace_out_channels, decoder_channels)
-        self.upsample_head = ProgressiveUpsampleHead(
-            in_channels=decoder_channels[0],
-            out_channels=out_channels,
-            target_bins=out_bins,
-            in_bands=in_bands,
-        )
-    def forward(self, x):  # returns the upsample head output for input x
-        H, W = x.shape[2:]  # only H is used below; W is unused
-        enc_feats = self.backbone(x)
-        h_ace_feats = self.hyperace(enc_feats)
-        dec_feat = self.decoder(enc_feats, h_ace_feats)
-        feat_time_restored = F.interpolate(  # bring dim 2 back to the input's H, keep the decoder's last-axis size
-            dec_feat, size=(H, dec_feat.shape[-1]), mode="bilinear", align_corners=False
-        )
-        out = self.upsample_head(feat_time_restored)
-        return out
-def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):  # Builds an nn.Sequential of `depth` Linear layers with `activation` between them (none after the last).
-    dim_hidden = default(dim_hidden, dim_in)  # hidden width falls back to the input width
-    net = []
-    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)  # layer widths: in, hidden x (depth - 1), out
-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-        is_last = ind == (len(dims) - 2)
-        net.append(nn.Linear(layer_dim_in, layer_dim_out))
-        if is_last:  # no activation after the output layer
-            continue
-        net.append(activation())
-    return nn.Sequential(*net)
-class MaskEstimator(Module):  # Per-band MLP+GLU outputs concatenated on the last axis, plus a SegmModel output added on top.
-    @beartype
-    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):  # dim: feature width per band; dim_inputs: output width per band; depth: MLP depth
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_freqs = ModuleList([])
-        dim_hidden = dim * mlp_expansion_factor
-        for dim_in in dim_inputs:
-            net = []  # NOTE(review): never used
-            mlp = nn.Sequential(
-                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)  # GLU halves the last axis: dim_in * 2 -> dim_in
-            )
-            self.to_freqs.append(mlp)
-        self.segm = SegmModel(  # with SegmModel's default out_channels=4 the flattened output width is 4 * (sum(dim_inputs) // 4)
-            in_bands=len(dim_inputs), in_dim=dim, out_bins=sum(dim_inputs) // 4
-        )
-    def forward(self, x):  # x is laid out as "b t f c" per the rearrange below; returns a tensor shaped (b, t, sum of band widths)
-        y = rearrange(x, "b t f c -> b c t f")  # channels-first layout for the conv model
-        y = self.segm(y)
-        y = rearrange(y, "b c t f -> b t (f c)")  # flatten (f, c) into one last axis, f-major
-        x = x.unbind(dim=-2)  # one (b, t, c) tensor per band
-        outs = []
-        for band_features, mlp in zip(x, self.to_freqs):
-            freq_out = mlp(band_features)
-            outs.append(freq_out)
-        return torch.cat(outs, dim=-1) + y  # the addition requires y's last axis to equal sum(dim_inputs)
-DEFAULT_FREQS_PER_BANDS = (  # 62 band widths summing to 1025; used as the default for `freqs_per_bands`
-    2,  # 24 bands of width 2
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    2,
-    4,  # 12 bands of width 4
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    4,
-    12,  # 8 bands of width 12
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    12,
-    24,  # 8 bands of width 24
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    24,
-    48,  # 8 bands of width 48
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    48,
-    128,  # final two bands: 128 and 129
-    129,
-)
-class BSRoformerHyperACE_2(Module):  # STFT -> BandSplit -> alternating time/frequency Transformers -> per-stem MaskEstimator -> complex masking -> iSTFT; returns audio, or a loss when `target` is given.
-    @beartype
-    def __init__(
-        self,
-        dim,  # feature width used by BandSplit, the Transformers and the MaskEstimators
-        *,
-        depth,  # number of (time Transformer, frequency Transformer) pairs
-        stereo=False,
-        num_stems=1,  # one MaskEstimator is built per stem
-        time_transformer_depth=2,
-        freq_transformer_depth=2,
-        linear_transformer_depth=0,  # NOTE(review): not referenced anywhere in this class
-        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        flash_attn=True,
-        dim_freqs_in=1025,  # NOTE(review): not referenced anywhere in this class
-        stft_n_fft=2048,
-        stft_hop_length=512,
-        stft_win_length=2048,
-        stft_normalized=False,
-        stft_window_fn: Optional[Callable] = None,  # falls back to torch.hann_window
-        mask_estimator_depth=2,
-        multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
-            4096,
-            2048,
-            1024,
-            512,
-            256,
-        ),
-        multi_stft_hop_size=147,
-        multi_stft_normalized=False,
-        multi_stft_window_fn: Callable = torch.hann_window,
-        mlp_expansion_factor=4,
-        use_torch_checkpoint=False,  # stored on self; forward below never reads it
-        skip_connection=False,  # stored on self; forward below never reads it
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.stereo = stereo
-        self.audio_channels = 2 if stereo else 1
-        self.num_stems = num_stems
-        self.use_torch_checkpoint = use_torch_checkpoint
-        self.skip_connection = skip_connection
-        self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
-        transformer_kwargs = dict(
-            dim=dim,
-            heads=heads,
-            dim_head=dim_head,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            flash_attn=flash_attn,
-            norm_output=False,
-            sage_attention=sage_attention,  # NOTE(review): the Transformer on the added side of this diff has no such keyword — confirm against the definition in use
-        )
-        time_rotary_embed = RotaryEmbedding(dim=dim_head)  # one rotary embedding shared by every time Transformer
-        freq_rotary_embed = RotaryEmbedding(dim=dim_head)  # one rotary embedding shared by every frequency Transformer
-        for _ in range(depth):
-            tran_modules = []
-            tran_modules.append(
-                Transformer(
-                    depth=time_transformer_depth,
-                    rotary_embed=time_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            tran_modules.append(
-                Transformer(
-                    depth=freq_transformer_depth,
-                    rotary_embed=freq_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            self.layers.append(nn.ModuleList(tran_modules))
-        self.final_norm = RMSNorm(dim)
-        self.stft_kwargs = dict(
-            n_fft=stft_n_fft,
-            hop_length=stft_hop_length,
-            win_length=stft_win_length,
-            normalized=stft_normalized,
-        )
-        self.stft_window_fn = partial(
-            default(stft_window_fn, torch.hann_window), stft_win_length
-        )
-        freqs = torch.stft(  # run a dummy STFT just to read off the number of frequency bins
-            torch.randn(1, 4096),
-            **self.stft_kwargs,
-            window=torch.ones(stft_win_length),
-            return_complex=True,
-        ).shape[1]
-        assert len(freqs_per_bands) > 1
-        assert (
-            sum(freqs_per_bands) == freqs
-        ), f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
-        freqs_per_bands_with_complex = tuple(  # each bin contributes real+imag (x2) for every audio channel
-            2 * f * self.audio_channels for f in freqs_per_bands
-        )
-        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
-        self.mask_estimators = nn.ModuleList([])
-        for _ in range(num_stems):
-            mask_estimator = MaskEstimator(
-                dim=dim,
-                dim_inputs=freqs_per_bands_with_complex,
-                depth=mask_estimator_depth,
-                mlp_expansion_factor=mlp_expansion_factor,
-            )
-            self.mask_estimators.append(mask_estimator)
-        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
-        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
-        self.multi_stft_n_fft = stft_n_fft  # lower bound for n_fft in the multi-resolution loss
-        self.multi_stft_window_fn = multi_stft_window_fn
-        self.multi_stft_kwargs = dict(
-            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
-        )
-    def forward(self, raw_audio, target=None, return_loss_breakdown=False):  # returns reconstructed audio when target is None; otherwise total loss, or (total, (l1, multi-res stft)) if return_loss_breakdown
-        device = raw_audio.device
-        x_is_mps = True if device.type == "mps" else False
-        if raw_audio.ndim == 2:  # (b, t) input gets a singleton channel axis
-            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
-        channels = raw_audio.shape[1]
-        assert (not self.stereo and channels == 1) or (
-            self.stereo and channels == 2
-        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
-        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")  # fold batch and channel into one leading axis
-        stft_window = self.stft_window_fn(device=device)
-        try:
-            stft_repr = torch.stft(
-                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
-            )
-        except:  # NOTE(review): bare except; retries on CPU only for MPS devices, otherwise repeats the same call
-            stft_repr = torch.stft(
-                raw_audio.cpu() if x_is_mps else raw_audio,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=True,
-            ).to(device)
-        stft_repr = torch.view_as_real(stft_repr)  # complex -> trailing axis of size 2
-        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
-        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")  # merge audio channels into the frequency axis
-        x = rearrange(stft_repr, "b f t c -> b t (f c)")
-        x = self.band_split(x)
-        for i, transformer_block in enumerate(self.layers):
-            time_transformer, freq_transformer = transformer_block
-            x = rearrange(x, "b t f d -> b f t d")
-            x, ps = pack([x], "* t d")  # attend along t, with (b, f) folded together
-            x = time_transformer(x)
-            (x,) = unpack(x, ps, "* t d")
-            x = rearrange(x, "b f t d -> b t f d")
-            x, ps = pack([x], "* f d")  # attend along f, with (b, t) folded together
-            x = freq_transformer(x)
-            (x,) = unpack(x, ps, "* f d")
-        x = self.final_norm(x)
-        num_stems = len(self.mask_estimators)
-        mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)  # stack per-stem masks on a new axis 1
-        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
-        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")  # singleton stem axis so it broadcasts against the mask
-        stft_repr = torch.view_as_complex(stft_repr)
-        mask = torch.view_as_complex(mask)
-        stft_repr = stft_repr * mask  # complex multiplication applies the mask
-        stft_repr = rearrange(
-            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
-        )
-        try:
-            recon_audio = torch.istft(
-                stft_repr,
-                **self.stft_kwargs,
-                window=stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            )
-        except:  # NOTE(review): same bare-except fallback as for the forward STFT above
-            recon_audio = torch.istft(
-                stft_repr.cpu() if x_is_mps else stft_repr,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            ).to(device)
-        recon_audio = rearrange(
-            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
-        )
-        if num_stems == 1:  # drop the stem axis for single-stem models
-            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
-        if not exists(target):  # inference path: no loss computed
-            return recon_audio
-        if self.num_stems > 1:
-            assert target.ndim == 4 and target.shape[1] == self.num_stems
-        if target.ndim == 2:
-            target = rearrange(target, "... t -> ... 1 t")
-        target = target[..., : recon_audio.shape[-1]]  # trim target to the reconstructed length
-        loss = F.l1_loss(recon_audio, target)
-        multi_stft_resolution_loss = 0.0
-        for window_size in self.multi_stft_resolutions_window_sizes:
-            res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft),  # n_fft never drops below the model's stft_n_fft
-                win_length=window_size,
-                return_complex=True,
-                window=self.multi_stft_window_fn(window_size, device=device),
-                **self.multi_stft_kwargs,
-            )
-            recon_Y = torch.stft(
-                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            target_Y = torch.stft(
-                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
-                recon_Y, target_Y
-            )
-        weighted_multi_resolution_loss = (
-            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
-        )
-        total_loss = loss + weighted_multi_resolution_loss
-        if not return_loss_breakdown:
-            return total_loss
-        return total_loss, (loss, multi_stft_resolution_loss)  # breakdown carries the unweighted multi-resolution term

+from functools import partial
+import torch
+from torch import nn, einsum, Tensor
+from torch.nn import Module, ModuleList
+import torch.nn.functional as F
+from .attend import Attend
+from torch.utils.checkpoint import checkpoint
+from beartype.typing import Tuple, Optional, List, Callable
+from beartype import beartype
+from rotary_embedding_torch import RotaryEmbedding
+from einops import rearrange, pack, unpack
+from einops.layers.torch import Rearrange
+import torchaudio
+def exists(val):  # True when val is not None
+    return val is not None
+def default(v, d):  # returns v unless it is None, in which case d
+    return v if exists(v) else d
+def pack_one(t, pattern):  # einops.pack on a single tensor; returns (packed, packed_shapes)
+    return pack([t], pattern)
+def unpack_one(t, ps, pattern):  # inverse of pack_one: unpack and return the single tensor
+    return unpack(t, ps, pattern)[0]
+def l2norm(t):  # scales t to unit L2 norm along its last axis
+    return F.normalize(t, dim=-1, p=2)
+class RMSNorm(Module):  # normalizes the last axis, then rescales by sqrt(dim) and a learned per-feature gamma
+    def __init__(self, dim):  # dim: size of the last axis of the inputs
+        super().__init__()
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.ones(dim))  # learned gain, initialised to 1
+    def forward(self, x):
+        return F.normalize(x, dim=-1) * self.scale * self.gamma
+class FeedForward(Module):  # pre-normed MLP: RMSNorm -> Linear -> GELU -> Dropout -> Linear -> Dropout
+    def __init__(self, dim, mult=4, dropout=0.0):  # mult: hidden width as a multiple of dim
+        super().__init__()
+        dim_inner = int(dim * mult)
+        self.net = nn.Sequential(
+            RMSNorm(dim),
+            nn.Linear(dim, dim_inner),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(dim_inner, dim),
+            nn.Dropout(dropout),
+        )
+    def forward(self, x):  # output has the same last-axis width as the input (dim)
+        return self.net(x)
+class Attention(Module):  # pre-normed multi-head attention with optional rotary embeddings and a per-head sigmoid output gate
+    def __init__(
+        self,
+        dim,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        rotary_embed=None,  # optional object exposing rotate_queries_or_keys
+        flash=True,
+    ):
+        super().__init__()
+        self.heads = heads
+        self.scale = dim_head**-0.5  # stored; forward below never reads it
+        dim_inner = heads * dim_head
+        self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)  # fused q/k/v projection
+        self.to_gates = nn.Linear(dim, heads)  # one gate logit per head and position
+        self.to_out = nn.Sequential(
+            nn.Linear(dim_inner, dim, bias=False), nn.Dropout(dropout)
+        )
+    def forward(self, x):  # x is laid out as (b, n, dim) per the rearrange patterns; output has the same layout
+        x = self.norm(x)
+        q, k, v = rearrange(
+            self.to_qkv(x), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
+        )
+        if exists(self.rotary_embed):
+            q = self.rotary_embed.rotate_queries_or_keys(q)
+            k = self.rotary_embed.rotate_queries_or_keys(k)
+        out = self.attend(q, k, v)
+        gates = self.to_gates(x)  # gates come from the normalised input, not from the attention output
+        out = out * rearrange(gates, "b n h -> b h n 1").sigmoid()
+        out = rearrange(out, "b h n d -> b n (h d)")  # merge heads
+        return self.to_out(out)
+class LinearAttention(Module):  # attention run on q/k/v laid out as (b, h, d, n), with L2-normalised q/k and a learned per-head temperature
+    @beartype
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head=32,
+        heads=8,
+        scale=8,  # forwarded to Attend
+        flash=True,
+        dropout=0.0,
+    ):
+        super().__init__()
+        dim_inner = dim_head * heads
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Sequential(
+            nn.Linear(dim, dim_inner * 3, bias=False),
+            Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads),  # note d precedes n: the last axis is the sequence
+        )
+        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))  # used as exp(temperature) in forward
+        self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
+        self.to_out = nn.Sequential(
+            Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)
+        )
+    def forward(self, x):  # x is laid out as (b, n, dim) per the Rearrange patterns; output has the same layout
+        x = self.norm(x)
+        q, k, v = self.to_qkv(x)
+        q, k = map(l2norm, (q, k))  # unit-normalise along the last axis (n in this layout)
+        q = q * self.temperature.exp()
+        out = self.attend(q, k, v)
+        return self.to_out(out)
+class Transformer(Module):  # `depth` residual (attention, feed-forward) layers followed by an optional RMSNorm
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,  # number of (attention, feed-forward) pairs
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        ff_mult=4,
+        norm_output=True,  # False replaces the final RMSNorm with nn.Identity
+        rotary_embed=None,  # only passed to Attention; LinearAttention does not take it
+        flash_attn=True,
+        linear_attn=False,  # True selects LinearAttention instead of Attention
+    ):
+        super().__init__()
+        self.layers = ModuleList([])
+        for _ in range(depth):
+            if linear_attn:
+                attn = LinearAttention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    flash=flash_attn,
+                )
+            else:
+                attn = Attention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    rotary_embed=rotary_embed,
+                    flash=flash_attn,
+                )
+            self.layers.append(
+                ModuleList(
+                    [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
+                )
+            )
+        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
+    def forward(self, x):  # output has the same shape as x
+        for attn, ff in self.layers:
+            x = attn(x) + x  # residual around attention
+            x = ff(x) + x  # residual around feed-forward
+        return self.norm(x)
+class BandSplit(Module):  # splits the last axis into bands of widths dim_inputs and projects each band to `dim` features
+    @beartype
+    def __init__(self, dim, dim_inputs: Tuple[int, ...]):  # dim_inputs: width of each band along the last axis
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_features = ModuleList([])
+        for dim_in in dim_inputs:
+            net = nn.Sequential(RMSNorm(dim_in), nn.Linear(dim_in, dim))  # each band has its own norm and projection
+            self.to_features.append(net)
+    def forward(self, x):  # (..., sum(dim_inputs)) -> (..., len(dim_inputs), dim)
+        x = x.split(self.dim_inputs, dim=-1)
+        outs = []
+        for split_input, to_feature in zip(x, self.to_features):
+            split_output = to_feature(split_input)
+            outs.append(split_output)
+        x = torch.stack(outs, dim=-2)  # new band axis goes just before the feature axis
+        return x
+class Conv(nn.Module):  # bias-free Conv2d -> affine InstanceNorm2d -> SiLU (or Identity when act is falsy)
+    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # c1/c2: in/out channels; k: kernel; s: stride; p: padding (auto when None); g: groups
+        super().__init__()
+        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
+        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)  # named `bn` but is an InstanceNorm2d
+        self.act = nn.SiLU() if act else nn.Identity()
+    def forward(self, x):
+        return self.act(self.bn(self.conv(x)))
+def autopad(k, p=None):  # returns p unchanged if given; otherwise k // 2 (element-wise when k is a sequence)
+    if p is None:
+        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
+    return p
+class DSConv(nn.Module):  # depthwise conv -> 1x1 pointwise conv -> affine InstanceNorm2d -> SiLU (or Identity)
+    def __init__(self, c1, c2, k=3, s=1, p=None, act=True):  # c1/c2: in/out channels; k: depthwise kernel; s: stride; p: padding (auto when None)
+        super().__init__()
+        self.dwconv = nn.Conv2d(c1, c1, k, s, autopad(k, p), groups=c1, bias=False)  # groups=c1: one filter per input channel
+        self.pwconv = nn.Conv2d(c1, c2, 1, 1, 0, bias=False)  # 1x1 conv mixes channels, c1 -> c2
+        self.bn = nn.InstanceNorm2d(c2, affine=True, eps=1e-8)  # named `bn` but is an InstanceNorm2d
+        self.act = nn.SiLU() if act else nn.Identity()
+    def forward(self, x):
+        return self.act(self.bn(self.pwconv(self.dwconv(x))))
+class DS_Bottleneck(nn.Module):  # two stacked DSConv layers with an optional identity residual
+    def __init__(self, c1, c2, k=3, shortcut=True):  # k applies to the second DSConv only; the first is fixed at 3
+        super().__init__()
+        c_ = c1  # hidden width equals the input width
+        self.dsconv1 = DSConv(c1, c_, k=3, s=1)
+        self.dsconv2 = DSConv(c_, c2, k=k, s=1)
+        self.shortcut = shortcut and c1 == c2  # residual only when channel counts match
+    def forward(self, x):
+        return (
+            x + self.dsconv2(self.dsconv1(x))
+            if self.shortcut
+            else self.dsconv2(self.dsconv1(x))
+        )
+class DS_C3k(nn.Module):  # two 1x1-conv branches (one followed by n DS_Bottlenecks), concatenated and fused by a 1x1 conv
+    def __init__(self, c1, c2, n=1, k=3, e=0.5):  # n: bottleneck count; k: bottleneck kernel; e: hidden-width ratio relative to c2
+        super().__init__()
+        c_ = int(c2 * e)
+        self.cv1 = Conv(c1, c_, 1, 1)  # feeds the bottleneck branch
+        self.cv2 = Conv(c1, c_, 1, 1)  # bypass branch
+        self.cv3 = Conv(2 * c_, c2, 1, 1)  # fuses the concatenated branches
+        self.m = nn.Sequential(
+            *[DS_Bottleneck(c_, c_, k=k, shortcut=True) for _ in range(n)]
+        )
+    def forward(self, x):
+        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), dim=1))
+class DS_C3k2(nn.Module):  # 1x1 conv to a hidden width -> DS_C3k at that width -> 1x1 conv to c2
+    def __init__(self, c1, c2, n=1, k=3, e=0.5):  # n and k are forwarded to DS_C3k; e: hidden-width ratio relative to c2
+        super().__init__()
+        c_ = int(c2 * e)
+        self.cv1 = Conv(c1, c_, 1, 1)
+        self.m = DS_C3k(c_, c_, n=n, k=k, e=1.0)  # e=1.0 keeps the inner block at width c_
+        self.cv2 = Conv(c_, c2, 1, 1)
+    def forward(self, x):
+        x_ = self.cv1(x)
+        x_ = self.m(x_)
+        return self.cv2(x_)
+class AdaptiveHyperedgeGeneration(nn.Module):  # builds a (B, num_hyperedges, N) matrix A from input-conditioned prototypes and multi-head similarity
+    def __init__(self, in_channels, num_hyperedges, num_heads=8):  # the views in forward need in_channels == num_heads * head_dim
+        super().__init__()
+        self.num_hyperedges = num_hyperedges
+        self.num_heads = num_heads
+        self.head_dim = in_channels // num_heads
+        self.global_proto = nn.Parameter(torch.randn(num_hyperedges, in_channels))  # learned prototypes shared across inputs
+        self.context_mapper = nn.Linear(
+            2 * in_channels, num_hyperedges * in_channels, bias=False
+        )
+        self.query_proj = nn.Linear(in_channels, in_channels, bias=False)
+        self.scale = self.head_dim**-0.5
+    def forward(self, x):  # x: (B, N, C); returns A: (B, num_hyperedges, N), softmax-normalised over N
+        B, N, C = x.shape
+        f_avg = F.adaptive_avg_pool1d(x.permute(0, 2, 1), 1).squeeze(-1)  # (B, C) mean over N
+        f_max = F.adaptive_max_pool1d(x.permute(0, 2, 1), 1).squeeze(-1)  # (B, C) max over N
+        f_ctx = torch.cat((f_avg, f_max), dim=1)  # (B, 2C) global context
+        delta_P = self.context_mapper(f_ctx).view(B, self.num_hyperedges, C)  # per-input prototype offsets
+        P = self.global_proto.unsqueeze(0) + delta_P
+        z = self.query_proj(x)
+        z = z.view(B, N, self.num_heads, self.head_dim).permute(0, 2, 1, 3)  # (B, heads, N, head_dim)
+        P = P.view(B, self.num_hyperedges, self.num_heads, self.head_dim).permute(
+            0, 2, 3, 1
+        )
+        sim = (z @ P) * self.scale  # (B, heads, N, num_hyperedges)
+        s_bar = sim.mean(dim=1)  # average over heads
+        A = F.softmax(s_bar.permute(0, 2, 1), dim=-1)  # each hyperedge row sums to 1 over the N positions
+        return A
+class HypergraphConvolution(nn.Module):  # gathers features into hyperedges via A, transforms them, scatters back via A^T, and adds the result to x
+    def __init__(self, in_channels, out_channels):  # NOTE(review): the residual add in forward appears to need out_channels == in_channels
+        super().__init__()
+        self.W_e = nn.Linear(in_channels, in_channels, bias=False)
+        self.W_v = nn.Linear(in_channels, out_channels, bias=False)
+        self.act = nn.SiLU()
+    def forward(self, x, A):  # x: batched (B, N, C); A: (B, M, N), as required by the two bmm calls
+        f_m = torch.bmm(A, x)  # (B, M, C) hyperedge features
+        f_m = self.act(self.W_e(f_m))
+        x_out = torch.bmm(A.transpose(1, 2), f_m)  # back to (B, N, C)
+        x_out = self.act(self.W_v(x_out))
+        return x + x_out
+class AdaptiveHypergraphComputation(nn.Module):  # flattens a (B, C, H, W) map to positions, applies hyperedge generation + hypergraph conv, and reshapes back
+    def __init__(self, in_channels, out_channels, num_hyperedges=8, num_heads=8):
+        super().__init__()
+        self.adaptive_hyperedge_gen = AdaptiveHyperedgeGeneration(
+            in_channels, num_hyperedges, num_heads
+        )
+        self.hypergraph_conv = HypergraphConvolution(in_channels, out_channels)
+    def forward(self, x):  # (B, C, H, W) -> (B, C', H, W), with C' inferred by the final view
+        B, C, H, W = x.shape
+        x_flat = x.flatten(2).permute(0, 2, 1)  # (B, H*W, C)
+        A = self.adaptive_hyperedge_gen(x_flat)
+        x_out_flat = self.hypergraph_conv(x_flat, A)
+        x_out = x_out_flat.permute(0, 2, 1).view(B, -1, H, W)
+        return x_out
+class C3AH(nn.Module):  # two 1x1-conv branches, one passed through AdaptiveHypergraphComputation, concatenated and fused by a 1x1 conv
+    def __init__(self, c1, c2, num_hyperedges=8, num_heads=8, e=0.5):  # e: hidden-width ratio relative to c1
+        super().__init__()
+        c_ = int(c1 * e)
+        self.cv1 = Conv(c1, c_, 1, 1)  # lateral branch
+        self.cv2 = Conv(c1, c_, 1, 1)  # feeds the hypergraph branch
+        self.ahc = AdaptiveHypergraphComputation(c_, c_, num_hyperedges, num_heads)
+        self.cv3 = Conv(2 * c_, c2, 1, 1)
+    def forward(self, x):
+        x_lateral = self.cv1(x)
+        x_ahc = self.ahc(self.cv2(x))
+        return self.cv3(torch.cat((x_ahc, x_lateral), dim=1))
+class HyperACE(nn.Module):  # fuses four feature maps at the third map's resolution, then processes channel groups in a hypergraph branch, a DS_C3k branch and a pass-through
+    def __init__(
+        self,
+        in_channels: List[int],  # channel counts of the four input maps, in order
+        out_channels: int,
+        num_hyperedges=8,
+        num_heads=8,
+        k=2,  # number of parallel C3AH modules in the high-order branch
+        l=1,  # number of sequential DS_C3k modules in the low-order branch
+        c_h=0.5,  # fraction of fused channels given to the high-order branch
+        c_l=0.25,  # fraction of fused channels given to the low-order branch
+    ):
+        super().__init__()
+        c2, c3, c4, c5 = in_channels
+        c_mid = c4
+        self.fuse_conv = Conv(c2 + c3 + c4 + c5, c_mid, 1, 1)
+        self.c_h = int(c_mid * c_h)
+        self.c_l = int(c_mid * c_l)
+        self.c_s = c_mid - self.c_h - self.c_l  # remaining channels pass through unchanged
+        assert self.c_s > 0, "Channel split error"
+        self.high_order_branch = nn.ModuleList(
+            [
+                C3AH(self.c_h, self.c_h, num_hyperedges, num_heads, e=1.0)
+                for _ in range(k)
+            ]
+        )
+        self.high_order_fuse = Conv(self.c_h * k, self.c_h, 1, 1)
+        self.low_order_branch = nn.Sequential(
+            *[DS_C3k(self.c_l, self.c_l, n=1, k=3, e=1.0) for _ in range(l)]
+        )
+        self.final_fuse = Conv(self.c_h + self.c_l + self.c_s, out_channels, 1, 1)
+    def forward(self, x: List[torch.Tensor]) -> torch.Tensor:  # x: four feature maps; output has out_channels channels at the third map's spatial size
+        B2, B3, B4, B5 = x
+        B, _, H4, W4 = B4.shape  # B is unused; only the spatial size is needed
+        B2_resized = F.interpolate(
+            B2, size=(H4, W4), mode="bilinear", align_corners=False
+        )
+        B3_resized = F.interpolate(
+            B3, size=(H4, W4), mode="bilinear", align_corners=False
+        )
+        B5_resized = F.interpolate(
+            B5, size=(H4, W4), mode="bilinear", align_corners=False
+        )
+        x_b = self.fuse_conv(torch.cat((B2_resized, B3_resized, B4, B5_resized), dim=1))
+        x_h, x_l, x_s = torch.split(x_b, [self.c_h, self.c_l, self.c_s], dim=1)
+        x_h_outs = [m(x_h) for m in self.high_order_branch]  # every C3AH sees the same x_h (parallel, not chained)
+        x_h_fused = self.high_order_fuse(torch.cat(x_h_outs, dim=1))
+        x_l_out = self.low_order_branch(x_l)
+        y = self.final_fuse(torch.cat((x_h_fused, x_l_out, x_s), dim=1))
+        return y
+class GatedFusion(nn.Module):  # returns f_in + gamma * h with a learned per-channel gamma
+    def __init__(self, in_channels):
+        super().__init__()
+        self.gamma = nn.Parameter(torch.zeros(1, in_channels, 1, 1))  # zero init: the module starts as identity on f_in
+    def forward(self, f_in, h):  # raises ValueError when the channel dims (axis 1) differ
+        if f_in.shape[1] != h.shape[1]:
+            raise ValueError(f"Channel mismatch: f_in={f_in.shape}, h={h.shape}")
+        return f_in + self.gamma * h
+class Backbone(nn.Module):  # strided DSConv stem plus four stages (strided DSConv + DS_C3k2); returns the four stage outputs
+    def __init__(self, in_channels=256, base_channels=64, base_depth=3):  # base_channels sets only the stem width; stage widths are fixed below
+        super().__init__()
+        c = base_channels  # NOTE(review): unused
+        c2 = base_channels
+        c3 = 256
+        c4 = 384
+        c5 = 512
+        c6 = 768
+        self.stem = DSConv(in_channels, c2, k=3, s=(2, 1), p=1)  # stride 2 on dim 2 only
+        self.p2 = nn.Sequential(
+            DSConv(c2, c3, k=3, s=(2, 1), p=1), DS_C3k2(c3, c3, n=base_depth)
+        )
+        self.p3 = nn.Sequential(
+            DSConv(c3, c4, k=3, s=(2, 1), p=1), DS_C3k2(c4, c4, n=base_depth * 2)
+        )
+        self.p4 = nn.Sequential(
+            DSConv(c4, c5, k=3, s=2, p=1), DS_C3k2(c5, c5, n=base_depth * 2)  # from here on both spatial dims use stride 2
+        )
+        self.p5 = nn.Sequential(
+            DSConv(c5, c6, k=3, s=2, p=1), DS_C3k2(c6, c6, n=base_depth)
+        )
+        self.out_channels = [c3, c4, c5, c6]  # channel counts of the maps returned by forward, in order
+    def forward(self, x):  # returns [p2, p3, p4, p5] stage outputs; the stem output itself is not returned
+        x = self.stem(x)
+        x2 = self.p2(x)
+        x3 = self.p3(x2)
+        x4 = self.p4(x3)
+        x5 = self.p5(x4)
+        return [x2, x3, x4, x5]
+class Decoder(nn.Module):  # top-down decoder: at each of four levels, add the skip feature, gate in a resized h_ace projection, then upsample to the next level
+    def __init__(
+        self,
+        encoder_channels: List[int],  # channel counts of the four encoder maps, in the order forward receives them
+        hyperace_out_c: int,  # channel count of h_ace
+        decoder_channels: List[int],  # channel counts used at decoder levels d2..d5
+    ):
+        super().__init__()
+        c_p2, c_p3, c_p4, c_p5 = encoder_channels
+        c_d2, c_d3, c_d4, c_d5 = decoder_channels
+        self.h_to_d5 = Conv(hyperace_out_c, c_d5, 1, 1)  # project h_ace to each level's width
+        self.h_to_d4 = Conv(hyperace_out_c, c_d4, 1, 1)
+        self.h_to_d3 = Conv(hyperace_out_c, c_d3, 1, 1)
+        self.h_to_d2 = Conv(hyperace_out_c, c_d2, 1, 1)
+        self.fusion_d5 = GatedFusion(c_d5)
+        self.fusion_d4 = GatedFusion(c_d4)
+        self.fusion_d3 = GatedFusion(c_d3)
+        self.fusion_d2 = GatedFusion(c_d2)
+        self.skip_p5 = Conv(c_p5, c_d5, 1, 1)  # 1x1 convs map encoder maps to decoder widths
+        self.skip_p4 = Conv(c_p4, c_d4, 1, 1)
+        self.skip_p3 = Conv(c_p3, c_d3, 1, 1)
+        self.skip_p2 = Conv(c_p2, c_d2, 1, 1)
+        self.up_d5 = DS_C3k2(c_d5, c_d4, n=1)  # change channel width after each spatial upsample
+        self.up_d4 = DS_C3k2(c_d4, c_d3, n=1)
+        self.up_d3 = DS_C3k2(c_d3, c_d2, n=1)
+        self.final_d2 = DS_C3k2(c_d2, c_d2, n=1)
+    def forward(self, enc_feats: List[torch.Tensor], h_ace: torch.Tensor):  # returns the d2-level feature map (spatial size of p2)
+        p2, p3, p4, p5 = enc_feats
+        d5 = self.skip_p5(p5)
+        h_d5 = self.h_to_d5(F.interpolate(h_ace, size=d5.shape[2:], mode="bilinear"))  # resize h_ace to this level before projecting
+        d5 = self.fusion_d5(d5, h_d5)
+        d5_up = F.interpolate(d5, size=p4.shape[2:], mode="bilinear")  # upsample to the next encoder map's size
+        d4_skip = self.skip_p4(p4)
+        d4 = self.up_d5(d5_up) + d4_skip
+        h_d4 = self.h_to_d4(F.interpolate(h_ace, size=d4.shape[2:], mode="bilinear"))
+        d4 = self.fusion_d4(d4, h_d4)
+        d4_up = F.interpolate(d4, size=p3.shape[2:], mode="bilinear")
+        d3_skip = self.skip_p3(p3)
+        d3 = self.up_d4(d4_up) + d3_skip
+        h_d3 = self.h_to_d3(F.interpolate(h_ace, size=d3.shape[2:], mode="bilinear"))
+        d3 = self.fusion_d3(d3, h_d3)
+        d3_up = F.interpolate(d3, size=p2.shape[2:], mode="bilinear")
+        d2_skip = self.skip_p2(p2)
+        d2 = self.up_d3(d3_up) + d2_skip
+        h_d2 = self.h_to_d2(F.interpolate(h_ace, size=d2.shape[2:], mode="bilinear"))
+        d2 = self.fusion_d2(d2, h_d2)
+        d2_final = self.final_d2(d2)
+        return d2_final
+class TFC_TDF(nn.Module):
+    def __init__(self, in_c, c, l, f, bn=4):
+        super().__init__()
+        self.blocks = nn.ModuleList()
+        for i in range(l):
+            block = nn.Module()
+            block.tfc1 = nn.Sequential(
+                nn.InstanceNorm2d(in_c, affine=True, eps=1e-8),
+                nn.SiLU(),
+                nn.Conv2d(in_c, c, 3, 1, 1, bias=False),
+            )
+            block.tdf = nn.Sequential(
+                nn.InstanceNorm2d(c, affine=True, eps=1e-8),
+                nn.SiLU(),
+                nn.Linear(f, f // bn, bias=False),
+                nn.InstanceNorm2d(c, affine=True, eps=1e-8),
+                nn.SiLU(),
+                nn.Linear(f // bn, f, bias=False),
+            )
+            block.tfc2 = nn.Sequential(
+                nn.InstanceNorm2d(c, affine=True, eps=1e-8),
+                nn.SiLU(),
+                nn.Conv2d(c, c, 3, 1, 1, bias=False),
+            )
+            block.shortcut = nn.Conv2d(in_c, c, 1, 1, 0, bias=False)
+            self.blocks.append(block)
+            in_c = c
+    def forward(self, x):
+        for block in self.blocks:
+            s = block.shortcut(x)
+            x = block.tfc1(x)
+            x = x + block.tdf(x)
+            x = block.tfc2(x)
+            x = x + s
+        return x
+class FreqPixelShuffle(nn.Module):
+    def __init__(self, in_channels, out_channels, scale, f):
+        super().__init__()
+        self.scale = scale
+        self.conv = DSConv(in_channels, out_channels * scale)
+        self.out_conv = TFC_TDF(out_channels, out_channels, 2, f)
+    def forward(self, x):
+        x = self.conv(x)
+        B, C_r, H, W = x.shape
+        out_c = C_r // self.scale
+        x = x.view(B, out_c, self.scale, H, W)
+        x = x.permute(0, 1, 3, 4, 2).contiguous()
+        x = x.view(B, out_c, H, W * self.scale)
+        return self.out_conv(x)
+class ProgressiveUpsampleHead(nn.Module):
+    def __init__(self, in_channels, out_channels, target_bins=1025, in_bands=62):
+        super().__init__()
+        self.target_bins = target_bins
+        c = in_channels
+        self.block1 = FreqPixelShuffle(c, c // 2, scale=2, f=in_bands * 2)
+        self.block2 = FreqPixelShuffle(c // 2, c // 4, scale=2, f=in_bands * 4)
+        self.block3 = FreqPixelShuffle(c // 4, c // 8, scale=2, f=in_bands * 8)
+        self.block4 = FreqPixelShuffle(c // 8, c // 16, scale=2, f=in_bands * 16)
+        self.final_conv = nn.Conv2d(
+            c // 16, out_channels, kernel_size=3, stride=1, padding="same", bias=False
+        )
+    def forward(self, x):
+        x = self.block1(x)
+        x = self.block2(x)
+        x = self.block3(x)
+        x = self.block4(x)
+        if x.shape[-1] != self.target_bins:
+            x = F.interpolate(
+                x,
+                size=(x.shape[2], self.target_bins),
+                mode="bilinear",
+                align_corners=False,
+            )
+        x = self.final_conv(x)
+        return x
+class SegmModel(nn.Module):
+    def __init__(
+        self,
+        in_bands=62,
+        in_dim=256,
+        out_bins=1025,
+        out_channels=4,
+        base_channels=64,
+        base_depth=2,
+        num_hyperedges=32,
+        num_heads=8,
+    ):
+        super().__init__()
+        self.backbone = Backbone(
+            in_channels=in_dim, base_channels=base_channels, base_depth=base_depth
+        )
+        enc_channels = self.backbone.out_channels
+        c2, c3, c4, c5 = enc_channels
+        hyperace_in_channels = enc_channels
+        hyperace_out_channels = c4
+        self.hyperace = HyperACE(
+            hyperace_in_channels,
+            hyperace_out_channels,
+            num_hyperedges,
+            num_heads,
+            k=2,
+            l=1,
+        )
+        decoder_channels = [c2, c3, c4, c5]
+        self.decoder = Decoder(enc_channels, hyperace_out_channels, decoder_channels)
+        self.upsample_head = ProgressiveUpsampleHead(
+            in_channels=decoder_channels[0],
+            out_channels=out_channels,
+            target_bins=out_bins,
+            in_bands=in_bands,
+        )
+    def forward(self, x):
+        H, W = x.shape[2:]
+        enc_feats = self.backbone(x)
+        h_ace_feats = self.hyperace(enc_feats)
+        dec_feat = self.decoder(enc_feats, h_ace_feats)
+        feat_time_restored = F.interpolate(
+            dec_feat, size=(H, dec_feat.shape[-1]), mode="bilinear", align_corners=False
+        )
+        out = self.upsample_head(feat_time_restored)
+        return out
+def MLP(dim_in, dim_out, dim_hidden=None, depth=1, activation=nn.Tanh):
+    dim_hidden = default(dim_hidden, dim_in)
+    net = []
+    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
+    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
+        is_last = ind == (len(dims) - 2)
+        net.append(nn.Linear(layer_dim_in, layer_dim_out))
+        if is_last:
+            continue
+        net.append(activation())
+    return nn.Sequential(*net)
+class MaskEstimator(Module):
+    @beartype
+    def __init__(self, dim, dim_inputs: Tuple[int, ...], depth, mlp_expansion_factor=4):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_freqs = ModuleList([])
+        dim_hidden = dim * mlp_expansion_factor
+        for dim_in in dim_inputs:
+            net = []
+            mlp = nn.Sequential(
+                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)
+            )
+            self.to_freqs.append(mlp)
+        self.segm = SegmModel(
+            in_bands=len(dim_inputs), in_dim=dim, out_bins=sum(dim_inputs) // 4
+        )
+    def forward(self, x):
+        y = rearrange(x, "b t f c -> b c t f")
+        y = self.segm(y)
+        y = rearrange(y, "b c t f -> b t (f c)")
+        x = x.unbind(dim=-2)
+        outs = []
+        for band_features, mlp in zip(x, self.to_freqs):
+            freq_out = mlp(band_features)
+            outs.append(freq_out)
+        return torch.cat(outs, dim=-1) + y
+DEFAULT_FREQS_PER_BANDS = (
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    2,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    4,
+    12,
+    12,
+    12,
+    12,
+    12,
+    12,
+    12,
+    12,
+    24,
+    24,
+    24,
+    24,
+    24,
+    24,
+    24,
+    24,
+    48,
+    48,
+    48,
+    48,
+    48,
+    48,
+    48,
+    48,
+    128,
+    129,
+)
+class BSRoformerHyperACE_2(Module):
+    @beartype
+    def __init__(
+        self,
+        dim,
+        *,
+        depth,
+        stereo=False,
+        num_stems=1,
+        time_transformer_depth=2,
+        freq_transformer_depth=2,
+        linear_transformer_depth=0,
+        freqs_per_bands: Tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        flash_attn=True,
+        dim_freqs_in=1025,
+        stft_n_fft=2048,
+        stft_hop_length=512,
+        stft_win_length=2048,
+        stft_normalized=False,
+        stft_window_fn: Optional[Callable] = None,
+        mask_estimator_depth=2,
+        multi_stft_resolution_loss_weight=1.0,
+        multi_stft_resolutions_window_sizes: Tuple[int, ...] = (
+            4096,
+            2048,
+            1024,
+            512,
+            256,
+        ),
+        multi_stft_hop_size=147,
+        multi_stft_normalized=False,
+        multi_stft_window_fn: Callable = torch.hann_window,
+        mlp_expansion_factor=4,
+        use_torch_checkpoint=False,
+        skip_connection=False,
+        **kwargs
+    ):
+        super().__init__()
+        self.stereo = stereo
+        self.audio_channels = 2 if stereo else 1
+        self.num_stems = num_stems
+        self.use_torch_checkpoint = use_torch_checkpoint
+        self.skip_connection = skip_connection
+        self.layers = ModuleList([])
+        transformer_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
+            norm_output=False,
+        )
+        time_rotary_embed = RotaryEmbedding(dim=dim_head)
+        freq_rotary_embed = RotaryEmbedding(dim=dim_head)
+        for _ in range(depth):
+            tran_modules = []
+            tran_modules.append(
+                Transformer(
+                    depth=time_transformer_depth,
+                    rotary_embed=time_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            tran_modules.append(
+                Transformer(
+                    depth=freq_transformer_depth,
+                    rotary_embed=freq_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            self.layers.append(nn.ModuleList(tran_modules))
+        self.final_norm = RMSNorm(dim)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized,
+        )
+        self.stft_window_fn = partial(
+            default(stft_window_fn, torch.hann_window), stft_win_length
+        )
+        freqs = torch.stft(
+            torch.randn(1, 4096),
+            **self.stft_kwargs,
+            window=torch.ones(stft_win_length),
+            return_complex=True,
+        ).shape[1]
+        assert len(freqs_per_bands) > 1
+        assert (
+            sum(freqs_per_bands) == freqs
+        ), f"the number of freqs in the bands must equal {freqs} based on the STFT settings, but got {sum(freqs_per_bands)}"
+        freqs_per_bands_with_complex = tuple(
+            2 * f * self.audio_channels for f in freqs_per_bands
+        )
+        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
+        self.mask_estimators = nn.ModuleList([])
+        for _ in range(num_stems):
+            mask_estimator = MaskEstimator(
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
+                depth=mask_estimator_depth,
+                mlp_expansion_factor=mlp_expansion_factor,
+            )
+            self.mask_estimators.append(mask_estimator)
+        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
+        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
+        self.multi_stft_n_fft = stft_n_fft
+        self.multi_stft_window_fn = multi_stft_window_fn
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
+        )
+    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
+        device = raw_audio.device
+        x_is_mps = True if device.type == "mps" else False
+        if raw_audio.ndim == 2:
+            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
+        channels = raw_audio.shape[1]
+        assert (not self.stereo and channels == 1) or (
+            self.stereo and channels == 2
+        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, "* t")
+        stft_window = self.stft_window_fn(device=device)
+        try:
+            stft_repr = torch.stft(
+                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
+            )
+        except:
+            stft_repr = torch.stft(
+                raw_audio.cpu() if x_is_mps else raw_audio,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=True,
+            ).to(device)
+        stft_repr = torch.view_as_real(stft_repr)
+        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, "* f t c")
+        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
+        x = rearrange(stft_repr, "b f t c -> b t (f c)")
+        x = self.band_split(x)
+        for i, transformer_block in enumerate(self.layers):
+            time_transformer, freq_transformer = transformer_block
+            x = rearrange(x, "b t f d -> b f t d")
+            x, ps = pack([x], "* t d")
+            x = time_transformer(x)
+            (x,) = unpack(x, ps, "* t d")
+            x = rearrange(x, "b f t d -> b t f d")
+            x, ps = pack([x], "* f d")
+            x = freq_transformer(x)
+            (x,) = unpack(x, ps, "* f d")
+        x = self.final_norm(x)
+        num_stems = len(self.mask_estimators)
+        mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
+        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
+        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
+        stft_repr = torch.view_as_complex(stft_repr)
+        mask = torch.view_as_complex(mask)
+        stft_repr = stft_repr * mask
+        stft_repr = rearrange(
+            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
+        )
+        try:
+            recon_audio = torch.istft(
+                stft_repr,
+                **self.stft_kwargs,
+                window=stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            )
+        except:
+            recon_audio = torch.istft(
+                stft_repr.cpu() if x_is_mps else stft_repr,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            ).to(device)
+        recon_audio = rearrange(
+            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
+        )
+        if num_stems == 1:
+            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
+        if not exists(target):
+            return recon_audio
+        if self.num_stems > 1:
+            assert target.ndim == 4 and target.shape[1] == self.num_stems
+        if target.ndim == 2:
+            target = rearrange(target, "... t -> ... 1 t")
+        target = target[..., : recon_audio.shape[-1]]
+        loss = F.l1_loss(recon_audio, target)
+        multi_stft_resolution_loss = 0.0
+        for window_size in self.multi_stft_resolutions_window_sizes:
+            res_stft_kwargs = dict(
+                n_fft=max(window_size, self.multi_stft_n_fft),
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
+            )
+            recon_Y = torch.stft(
+                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            target_Y = torch.stft(
+                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
+                recon_Y, target_Y
+            )
+        weighted_multi_resolution_loss = (
+            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        )
+        total_loss = loss + weighted_multi_resolution_loss
+        if not return_loss_breakdown:
+            return total_loss
+        return total_loss, (loss, multi_stft_resolution_loss)

models/bs_roformer/bs_roformer_sw.py CHANGED Viewed

@@ -1,676 +1,657 @@
-from __future__ import annotations
-from functools import partial
-import torch
-import torch.nn.functional as F
-from beartype import beartype
-from beartype.typing import Callable
-from einops import pack, rearrange, unpack
-from einops.layers.torch import Rearrange
-from torch import nn
-from torch.nn import Module, ModuleList
-from torch.utils.checkpoint import checkpoint
-from .attend import Attend
-try:
-    from .attend_sage import AttendSage
-except ImportError:
-    pass
-def l2norm(t):
-    return F.normalize(t, dim=-1, p=2)
-class CustomNorm(Module):
-    def __init__(self, dim, eps: float = 5.960464477539063e-08):
-        super().__init__()
-        self.scale = dim**0.5
-        self.gamma = nn.Parameter(torch.ones(dim))
-        self.eps = eps
-    def forward(self, x):
-        l2_norm = torch.linalg.norm(x, dim=-1, keepdim=True)
-        denom = torch.maximum(l2_norm, torch.full_like(l2_norm, self.eps))
-        normalized_x = x / denom
-        return normalized_x * self.scale * self.gamma
-class RotaryEmbedding(nn.Module):
-    def __init__(self, cos_emb, sin_emb):
-        super().__init__()
-        self.cos_emb = cos_emb
-        self.sin_emb = sin_emb
-    def rotate_half(self, x):
-        x = rearrange(x, "... (d r) -> ... d r", r=2)
-        x1, x2 = x.unbind(dim=-1)
-        x = torch.stack((-x2, x1), dim=-1)
-        return rearrange(x, "... d r -> ... (d r)")
-    def forward(self, x):
-        cos_b = self.cos_emb.unsqueeze(0).unsqueeze(0).to(x.device, x.dtype)
-        sin_b = self.sin_emb.unsqueeze(0).unsqueeze(0).to(x.device, x.dtype)
-        term1 = x * cos_b
-        term2 = self.rotate_half(x) * sin_b
-        sum = term1.to(torch.float32) + term2.to(torch.float32)
-        return sum.to(x.dtype)
-class FeedForward(Module):
-    def __init__(self, dim, mult=4, dropout=0.0):
-        super().__init__()
-        dim_inner = int(dim * mult)
-        self.net = nn.Sequential(
-            CustomNorm(dim),
-            nn.Linear(dim, dim_inner),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_inner, dim),
-            nn.Dropout(dropout),
-        )
-    def forward(self, x):
-        return self.net(x)
-class Attention(Module):
-    def __init__(
-        self,
-        dim,
-        heads=8,
-        dim_head=64,
-        dropout=0.0,
-        shared_qkv_bias=None,
-        shared_out_bias=None,
-        rotary_embed: RotaryEmbedding | None = None,
-        flash=True,
-        sage_attention=False,
-    ):
-        super().__init__()
-        self.heads = heads
-        self.scale = dim_head**-0.5
-        dim_inner = heads * dim_head
-        self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)  # type: ignore
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)  # type: ignore
-        self.norm = CustomNorm(dim)
-        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=(shared_qkv_bias is not None))
-        if shared_qkv_bias is not None:
-            self.to_qkv.bias = shared_qkv_bias
-        self.to_gates = nn.Linear(dim, heads)
-        self.to_out = nn.Sequential(
-            nn.Linear(dim_inner, dim, bias=(shared_out_bias is not None)),
-            nn.Dropout(dropout),
-        )
-        if shared_out_bias is not None:
-            self.to_out[0].bias = shared_out_bias
-    def forward(self, x):
-        x = self.norm(x)
-        qkv = self.to_qkv(x)
-        q, k, v = rearrange(qkv, "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads)
-        if self.rotary_embed is not None:
-            q = self.rotary_embed(q)
-            k = self.rotary_embed(k)
-        out = self.attend(q, k, v)
-        gates = self.to_gates(x)
-        gate_act = gates.sigmoid()
-        out = out * rearrange(gate_act, "b n h -> b h n 1")
-        out = rearrange(out, "b h n d -> b n (h d)")
-        out = self.to_out(out)
-        return out
-class LinearAttention(Module):
-    @beartype
-    def __init__(
-        self,
-        *,
-        dim,
-        dim_head=32,
-        heads=8,
-        scale=8,
-        flash=True,
-        dropout=0.0,
-        sage_attention=False,
-    ):
-        super().__init__()
-        dim_inner = dim_head * heads
-        self.norm = CustomNorm(dim)
-        self.to_qkv = nn.Sequential(
-            nn.Linear(dim, dim_inner * 3, bias=False),
-            Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads),
-        )
-        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(scale=scale, dropout=dropout, flash=flash)  # type: ignore
-        else:
-            self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
-        self.to_out = nn.Sequential(
-            Rearrange("b h d n -> b n (h d)"), nn.Linear(dim_inner, dim, bias=False)
-        )
-    def forward(self, x):
-        x = self.norm(x)
-        q, k, v = self.to_qkv(x)
-        q, k = map(l2norm, (q, k))
-        q = q * self.temperature.exp()
-        out = self.attend(q, k, v)
-        return self.to_out(out)
-class Transformer(Module):
-    def __init__(
-        self,
-        *,
-        dim,
-        depth,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        ff_mult=4,
-        norm_output=True,
-        rotary_embed: RotaryEmbedding | None = None,
-        flash_attn=True,
-        linear_attn=False,
-        sage_attention=False,
-        shared_qkv_bias=None,
-        shared_out_bias=None,
-    ):
-        super().__init__()
-        self.layers = ModuleList([])
-        for _ in range(depth):
-            attn: LinearAttention | Attention
-            if linear_attn:
-                attn = LinearAttention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            else:
-                attn = Attention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    shared_qkv_bias=shared_qkv_bias,
-                    shared_out_bias=shared_out_bias,
-                    rotary_embed=rotary_embed,
-                    flash=flash_attn,
-                    sage_attention=sage_attention,
-                )
-            self.layers.append(
-                ModuleList(
-                    [attn, FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)]
-                )
-            )
-        self.norm = CustomNorm(dim) if norm_output else nn.Identity()
-    def forward(self, x):
-        for attn, ff in self.layers:  # type: ignore
-            x = attn(x) + x
-            x = ff(x) + x
-        return self.norm(x)
-class BandSplit(Module):
-    @beartype
-    def __init__(self, dim, dim_inputs: tuple[int, ...]):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_features = ModuleList([])
-        for dim_in in dim_inputs:
-            net = nn.Sequential(CustomNorm(dim_in), nn.Linear(dim_in, dim))
-            self.to_features.append(net)
-    def forward(self, x):
-        x = x.split(self.dim_inputs, dim=-1)
-        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
-            split_output = to_feature(split_input)
-            outs.append(split_output)
-        return torch.stack(outs, dim=-2)
-def MLP(
-    dim_in: int,
-    dim_out: int,
-    dim_hidden: int | None = None,
-    depth: int = 1,
-    activation=nn.Tanh,
-):
-    dim_hidden = dim_hidden or dim_in
-    net = []
-    dims = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-        is_last = ind == (len(dims) - 2)
-        net.append(nn.Linear(layer_dim_in, layer_dim_out))
-        if is_last:
-            continue
-        net.append(activation())
-    return nn.Sequential(*net)
-class MaskEstimator(Module):
-    @beartype
-    def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_freqs = ModuleList([])
-        dim_hidden = dim * mlp_expansion_factor
-        for dim_in in dim_inputs:
-            mlp = nn.Sequential(
-                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth), nn.GLU(dim=-1)
-            )
-            self.to_freqs.append(mlp)
-    def forward(self, x):
-        x = x.unbind(dim=-2)
-        outs = []
-        for band_features, mlp in zip(x, self.to_freqs):
-            freq_out = mlp(band_features)
-            outs.append(freq_out)
-        return torch.cat(outs, dim=-1)
-# fmt: off
-DEFAULT_FREQS_PER_BANDS = (
-    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
-    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
-    12, 12, 12, 12, 12, 12, 12, 12,
-    24, 24, 24, 24, 24, 24, 24, 24,
-    48, 48, 48, 48, 48, 48, 48, 48,
-    128, 129
-)
-# fmt: on
-class BSRoformer_SW(Module):
-    @beartype
-    def __init__(
-        self,
-        dim,
-        *,
-        depth,
-        stereo=False,
-        num_stems=1,
-        time_transformer_depth=2,
-        freq_transformer_depth=2,
-        linear_transformer_depth=0,
-        freqs_per_bands: tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
-        dim_head=64,
-        heads=8,
-        attn_dropout=0.0,
-        ff_dropout=0.0,
-        flash_attn=True,
-        stft_n_fft=2048,
-        stft_hop_length=512,
-        stft_win_length=2048,
-        stft_normalized=False,
-        stft_window_fn: Callable | None = None,
-        mask_estimator_depth=2,
-        multi_stft_resolution_loss_weight=1.0,
-        multi_stft_resolutions_window_sizes: tuple[int, ...] = (
-            4096,
-            2048,
-            1024,
-            512,
-            256,
-        ),
-        multi_stft_hop_size=147,
-        multi_stft_normalized=False,
-        multi_stft_window_fn: Callable = torch.hann_window,
-        mlp_expansion_factor=4,
-        use_torch_checkpoint=False,
-        skip_connection=False,
-        sage_attention=False,
-        use_shared_bias=False,
-        chunk_size: int = 588800,
-    ):
-        super().__init__()
-        self.stereo = stereo
-        self.audio_channels = 2 if stereo else 1
-        self.num_stems = num_stems
-        self.use_torch_checkpoint = use_torch_checkpoint
-        self.skip_connection = skip_connection
-        self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
-        if use_shared_bias:
-            dim_inner = heads * dim_head
-            self.shared_qkv_bias = nn.Parameter(torch.ones(dim_inner * 3))
-            self.shared_out_bias = nn.Parameter(torch.ones(dim))
-        transformer_kwargs = dict(
-            dim=dim,
-            heads=heads,
-            dim_head=dim_head,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            flash_attn=flash_attn,
-            norm_output=False,
-            sage_attention=sage_attention,
-            shared_qkv_bias=self.shared_qkv_bias,
-            shared_out_bias=self.shared_out_bias,
-        )
-        t_frames = chunk_size // stft_hop_length + 1
-        self.cos_emb_time = nn.Parameter(torch.zeros(t_frames, dim_head))
-        self.sin_emb_time = nn.Parameter(torch.zeros(t_frames, dim_head))
-        time_rotary_embed = RotaryEmbedding(
-            cos_emb=self.cos_emb_time, sin_emb=self.sin_emb_time
-        )
-        num_bands = len(freqs_per_bands)
-        self.cos_emb_freq = nn.Parameter(torch.zeros(num_bands, dim_head))
-        self.sin_emb_freq = nn.Parameter(torch.zeros(num_bands, dim_head))
-        freq_rotary_embed = RotaryEmbedding(
-            cos_emb=self.cos_emb_freq, sin_emb=self.sin_emb_freq
-        )
-        for _ in range(depth):
-            tran_modules = []
-            if linear_transformer_depth > 0:
-                tran_modules.append(
-                    Transformer(
-                        depth=linear_transformer_depth,
-                        linear_attn=True,
-                        **transformer_kwargs,
-                    )
-                )
-            tran_modules.append(
-                Transformer(
-                    depth=time_transformer_depth,
-                    rotary_embed=time_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            tran_modules.append(
-                Transformer(
-                    depth=freq_transformer_depth,
-                    rotary_embed=freq_rotary_embed,
-                    **transformer_kwargs,
-                )
-            )
-            self.layers.append(nn.ModuleList(tran_modules))
-        self.final_norm = CustomNorm(dim)
-        self.stft_kwargs = dict(
-            n_fft=stft_n_fft,
-            hop_length=stft_hop_length,
-            win_length=stft_win_length,
-            normalized=stft_normalized,
-        )
-        self.stft_window_fn = partial(
-            stft_window_fn or torch.hann_window, stft_win_length
-        )
-        freqs_per_bands_with_complex = tuple(
-            2 * f * self.audio_channels for f in freqs_per_bands
-        )
-        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
-        self.mask_estimators = nn.ModuleList([])
-        for _ in range(num_stems):
-            mask_estimator = MaskEstimator(
-                dim=dim,
-                dim_inputs=freqs_per_bands_with_complex,
-                depth=mask_estimator_depth,
-                mlp_expansion_factor=mlp_expansion_factor,
-            )
-            self.mask_estimators.append(mask_estimator)
-        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
-        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
-        self.multi_stft_n_fft = stft_n_fft
-        self.multi_stft_window_fn = multi_stft_window_fn
-        self.multi_stft_kwargs = dict(
-            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
-        )
-    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
-        device = raw_audio.device
-        x_is_mps = True if device.type == "mps" else False
-        if raw_audio.ndim == 2:
-            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
-        channels = raw_audio.shape[1]
-        assert (not self.stereo and channels == 1) or (
-            self.stereo and channels == 2
-        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
-        raw_audio, batch_audio_channel_packed_shape = pack([raw_audio], "* t")
-        stft_window = self.stft_window_fn(device=device)
-        try:
-            stft_repr = torch.stft(
-                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
-            )
-        except Exception:
-            stft_repr = torch.stft(
-                raw_audio.cpu() if x_is_mps else raw_audio,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=True,
-            ).to(device)
-        stft_repr = torch.view_as_real(stft_repr)
-        stft_repr = unpack(stft_repr, batch_audio_channel_packed_shape, "* f t c")[0]
-        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
-        x = rearrange(stft_repr, "b f t c -> b t (f c)")
-        if torch.isnan(x).any() or torch.isinf(x).any():
-            raise RuntimeError(
-                f"NaN/Inf in x after stft: {x.isnan().sum()} NaNs, {x.isinf().sum()} Infs"
-            )
-        if self.use_torch_checkpoint:
-            x = checkpoint(self.band_split, x, use_reentrant=False)
-        else:
-            x = self.band_split(x)
-        if torch.isnan(x).any() or torch.isinf(x).any():
-            raise RuntimeError(
-                f"NaN/Inf in x after band_split: {x.isnan().sum()} NaNs, {x.isinf().sum()} Infs"
-            )
-        store = [None] * len(self.layers)
-        for i, transformer_block in enumerate(self.layers):
-            if len(transformer_block) == 3:
-                linear_transformer, time_transformer, freq_transformer = (
-                    transformer_block
-                )
-                x, ft_ps = pack([x], "b * d")
-                if self.use_torch_checkpoint:
-                    x = checkpoint(linear_transformer, x, use_reentrant=False)
-                else:
-                    x = linear_transformer(x)
-                (x,) = unpack(x, ft_ps, "b * d")
-            else:
-                time_transformer, freq_transformer = transformer_block
-            if self.skip_connection:
-                for j in range(i):
-                    x = x + store[j]
-            x = rearrange(x, "b t f d -> b f t d")
-            x, ps = pack([x], "* t d")
-            if self.use_torch_checkpoint:
-                x = checkpoint(time_transformer, x, use_reentrant=False)
-            else:
-                x = time_transformer(x)
-            (x,) = unpack(x, ps, "* t d")
-            x = rearrange(x, "b f t d -> b t f d")
-            x, ps = pack([x], "* f d")
-            if self.use_torch_checkpoint:
-                x = checkpoint(freq_transformer, x, use_reentrant=False)
-            else:
-                x = freq_transformer(x)
-            (x,) = unpack(x, ps, "* f d")
-            if self.skip_connection:
-                store[i] = x
-        x = self.final_norm(x)
-        num_stems = len(self.mask_estimators)
-        if self.use_torch_checkpoint:
-            mask = torch.stack(
-                [checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators],
-                dim=1,
-            )
-        else:
-            mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
-        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
-        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
-        stft_repr = torch.view_as_complex(stft_repr)
-        mask = torch.view_as_complex(mask)
-        stft_repr = stft_repr * mask
-        stft_repr = rearrange(
-            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
-        )
-        try:
-            recon_audio = torch.istft(
-                stft_repr,
-                **self.stft_kwargs,
-                window=stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            )
-        except Exception:
-            recon_audio = torch.istft(
-                stft_repr.cpu() if x_is_mps else stft_repr,
-                **self.stft_kwargs,
-                window=stft_window.cpu() if x_is_mps else stft_window,
-                return_complex=False,
-                length=raw_audio.shape[-1],
-            ).to(device)
-        recon_audio = rearrange(
-            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
-        )
-        if num_stems == 1:
-            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
-        if target is None:
-            return recon_audio
-        if self.num_stems > 1:
-            assert target.ndim == 4 and target.shape[1] == self.num_stems
-        if target.ndim == 2:
-            target = rearrange(target, "... t -> ... 1 t")
-        target = target[..., : recon_audio.shape[-1]]
-        loss = F.l1_loss(recon_audio, target)
-        multi_stft_resolution_loss = 0.0
-        for window_size in self.multi_stft_resolutions_window_sizes:
-            res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft),
-                win_length=window_size,
-                return_complex=True,
-                window=self.multi_stft_window_fn(window_size, device=device),
-                **self.multi_stft_kwargs,
-            )
-            recon_Y = torch.stft(
-                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            target_Y = torch.stft(
-                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
-            )
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
-                recon_Y, target_Y
-            )
-        weighted_multi_resolution_loss = (
-            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
-        )
-        total_loss = loss + weighted_multi_resolution_loss
-        if not return_loss_breakdown:
-            return total_loss
-        return total_loss, (loss, multi_stft_resolution_loss)

+from __future__ import annotations
+from functools import partial
+import torch
+import torch.nn.functional as F
+from beartype import beartype
+from beartype.typing import Callable
+from einops import pack, rearrange, unpack
+from einops.layers.torch import Rearrange
+from torch import nn
+from torch.nn import Module, ModuleList
+from torch.utils.checkpoint import checkpoint
+from .attend import Attend
+def l2norm(t):
+    """Scale ``t`` to unit Euclidean (p=2) length along its last dimension."""
+    unit = F.normalize(t, p=2, dim=-1)
+    return unit
+class CustomNorm(Module):
+    """Normalise the last axis by its L2 norm, then rescale.
+    The output is ``x / max(||x||, eps) * sqrt(dim) * gamma`` where ``gamma`` is a
+    learned per-feature gain initialised to ones.
+    """
+    def __init__(self, dim, eps: float = 5.960464477539063e-08):
+        super().__init__()
+        self.scale = dim**0.5
+        self.gamma = nn.Parameter(torch.ones(dim))
+        self.eps = eps
+    def forward(self, x):
+        norm = torch.linalg.norm(x, dim=-1, keepdim=True)
+        # Clamp the norm from below so all-zero vectors do not divide by zero.
+        floor = torch.full_like(norm, self.eps)
+        unit = x / torch.maximum(norm, floor)
+        return unit * self.scale * self.gamma
+class RotaryEmbedding(nn.Module):
+    """Rotate features using cos/sin tables that are supplied by the caller."""
+    def __init__(self, cos_emb, sin_emb):
+        super().__init__()
+        self.cos_emb = cos_emb
+        self.sin_emb = sin_emb
+    def rotate_half(self, x):
+        """Map every adjacent pair ``(a, b)`` on the last axis to ``(-b, a)``."""
+        pairs = rearrange(x, "... (d r) -> ... d r", r=2)
+        first, second = pairs.unbind(dim=-1)
+        rotated = torch.stack((-second, first), dim=-1)
+        return rearrange(rotated, "... d r -> ... (d r)")
+    def forward(self, x):
+        """Return ``x * cos + rotate_half(x) * sin`` in the dtype of ``x``."""
+        # Two leading singleton axes let the tables broadcast against x.
+        cos_b = self.cos_emb[None, None].to(x.device, x.dtype)
+        sin_b = self.sin_emb[None, None].to(x.device, x.dtype)
+        cos_part = x * cos_b
+        sin_part = self.rotate_half(x) * sin_b
+        # Only the final addition is carried out in float32.
+        total = cos_part.to(torch.float32) + sin_part.to(torch.float32)
+        return total.to(x.dtype)
+class FeedForward(Module):
+    """Pre-normalised two-layer GELU MLP with dropout after the activation and after the output."""
+    def __init__(self, dim, mult=4, dropout=0.0):
+        super().__init__()
+        hidden = int(dim * mult)
+        stages = [
+            CustomNorm(dim),
+            nn.Linear(dim, hidden),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden, dim),
+            nn.Dropout(dropout),
+        ]
+        self.net = nn.Sequential(*stages)
+    def forward(self, x):
+        out = self.net(x)
+        return out
+class Attention(Module):
+    """Multi-head self-attention with per-head sigmoid output gates.
+    ``shared_qkv_bias`` / ``shared_out_bias``, when given, are installed as the bias
+    of the QKV and output projections; when ``None`` those projections have no bias.
+    ``rotary_embed``, when given, is applied to queries and keys.
+    """
+    def __init__(
+        self,
+        dim,
+        heads=8,
+        dim_head=64,
+        dropout=0.0,
+        shared_qkv_bias=None,
+        shared_out_bias=None,
+        rotary_embed: RotaryEmbedding | None = None,
+        flash=True,
+    ):
+        super().__init__()
+        use_qkv_bias = shared_qkv_bias is not None
+        use_out_bias = shared_out_bias is not None
+        inner = heads * dim_head
+        self.heads = heads
+        self.scale = dim_head**-0.5
+        self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)  # type: ignore
+        self.norm = CustomNorm(dim)
+        self.to_qkv = nn.Linear(dim, inner * 3, bias=use_qkv_bias)
+        if use_qkv_bias:
+            # Replace the freshly created bias with the caller's shared parameter.
+            self.to_qkv.bias = shared_qkv_bias
+        self.to_gates = nn.Linear(dim, heads)
+        out_proj = nn.Linear(inner, dim, bias=use_out_bias)
+        self.to_out = nn.Sequential(out_proj, nn.Dropout(dropout))
+        if use_out_bias:
+            self.to_out[0].bias = shared_out_bias
+    def forward(self, x):
+        normed = self.norm(x)
+        q, k, v = rearrange(
+            self.to_qkv(normed), "b n (qkv h d) -> qkv b h n d", qkv=3, h=self.heads
+        )
+        if self.rotary_embed is not None:
+            q = self.rotary_embed(q)
+            k = self.rotary_embed(k)
+        attended = self.attend(q, k, v)
+        # One sigmoid gate per head and position, computed from the normalised input.
+        gate = self.to_gates(normed).sigmoid()
+        attended = attended * rearrange(gate, "b n h -> b h n 1")
+        merged = rearrange(attended, "b h n d -> b n (h d)")
+        return self.to_out(merged)
+class LinearAttention(Module):
+    """Attention over the feature axis: q and k are l2-normalised and q is scaled by a learned per-head temperature."""
+    @beartype
+    def __init__(
+        self,
+        *,
+        dim,
+        dim_head=32,
+        heads=8,
+        scale=8,
+        flash=True,
+        dropout=0.0,
+    ):
+        super().__init__()
+        inner = dim_head * heads
+        self.norm = CustomNorm(dim)
+        qkv_proj = nn.Linear(dim, inner * 3, bias=False)
+        # The rearrange puts the sequence axis last, so attention runs with d and n swapped.
+        split_heads = Rearrange("b n (qkv h d) -> qkv b h d n", qkv=3, h=heads)
+        self.to_qkv = nn.Sequential(qkv_proj, split_heads)
+        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
+        merge_heads = Rearrange("b h d n -> b n (h d)")
+        self.to_out = nn.Sequential(merge_heads, nn.Linear(inner, dim, bias=False))
+    def forward(self, x):
+        q, k, v = self.to_qkv(self.norm(x))
+        q = l2norm(q)
+        k = l2norm(k)
+        q = q * self.temperature.exp()
+        attended = self.attend(q, k, v)
+        return self.to_out(attended)
+class Transformer(Module):
+    """Stack of ``depth`` residual (attention, feed-forward) pairs with an optional output norm.
+    ``linear_attn`` selects ``LinearAttention`` for every layer; otherwise ``Attention``
+    is used and receives the rotary embedding and shared biases. Extra keyword
+    arguments are accepted and ignored.
+    """
+    def __init__(
+        self,
+        *,
+        dim,
+        depth,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        ff_mult=4,
+        norm_output=True,
+        rotary_embed: RotaryEmbedding | None = None,
+        flash_attn=True,
+        linear_attn=False,
+        shared_qkv_bias=None,
+        shared_out_bias=None,
+        **kwargs
+    ):
+        super().__init__()
+        # Arguments common to both attention flavours.
+        common = dict(
+            dim=dim,
+            dim_head=dim_head,
+            heads=heads,
+            dropout=attn_dropout,
+            flash=flash_attn,
+        )
+        self.layers = ModuleList([])
+        for _ in range(depth):
+            if linear_attn:
+                attn = LinearAttention(**common)
+            else:
+                attn = Attention(
+                    shared_qkv_bias=shared_qkv_bias,
+                    shared_out_bias=shared_out_bias,
+                    rotary_embed=rotary_embed,
+                    **common,
+                )
+            ff = FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
+            self.layers.append(ModuleList([attn, ff]))
+        self.norm = CustomNorm(dim) if norm_output else nn.Identity()
+    def forward(self, x):
+        for attn_layer, ff_layer in self.layers:  # type: ignore
+            x = attn_layer(x) + x
+            x = ff_layer(x) + x
+        x = self.norm(x)
+        return x
+class BandSplit(Module):
+    """Split the last axis into bands of widths ``dim_inputs`` and project each band to ``dim``."""
+    @beartype
+    def __init__(self, dim, dim_inputs: tuple[int, ...]):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        projections = [
+            nn.Sequential(CustomNorm(width), nn.Linear(width, dim))
+            for width in dim_inputs
+        ]
+        self.to_features = ModuleList(projections)
+    def forward(self, x):
+        bands = x.split(self.dim_inputs, dim=-1)
+        projected = [proj(band) for band, proj in zip(bands, self.to_features)]
+        # Bands become a new second-to-last axis.
+        return torch.stack(projected, dim=-2)
+def MLP(
+    dim_in: int,
+    dim_out: int,
+    dim_hidden: int | None = None,
+    depth: int = 1,
+    activation=nn.Tanh,
+):
+    """Build an ``nn.Sequential`` of ``depth`` linear layers with ``activation`` between them.
+    Hidden layers have width ``dim_hidden`` (``dim_in`` when that is falsy); no
+    activation follows the final linear layer.
+    """
+    dim_hidden = dim_hidden or dim_in
+    widths = (dim_in, *((dim_hidden,) * (depth - 1)), dim_out)
+    last_index = len(widths) - 2
+    layers = []
+    for index, (width_in, width_out) in enumerate(zip(widths[:-1], widths[1:])):
+        layers.append(nn.Linear(width_in, width_out))
+        if index != last_index:
+            layers.append(activation())
+    return nn.Sequential(*layers)
+class MaskEstimator(Module):
+    """Per-band MLP + GLU heads whose outputs are concatenated along the last axis.
+    Each head maps ``dim`` features to ``2 * dim_in`` values that the GLU halves back
+    to ``dim_in``.
+    """
+    @beartype
+    def __init__(self, dim, dim_inputs: tuple[int, ...], depth, mlp_expansion_factor=4):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        hidden = dim * mlp_expansion_factor
+        heads = [
+            nn.Sequential(
+                MLP(dim, width * 2, dim_hidden=hidden, depth=depth), nn.GLU(dim=-1)
+            )
+            for width in dim_inputs
+        ]
+        self.to_freqs = ModuleList(heads)
+    def forward(self, x):
+        # One feature slice per band, taken from the second-to-last axis.
+        per_band = x.unbind(dim=-2)
+        estimates = [head(band) for band, head in zip(per_band, self.to_freqs)]
+        return torch.cat(estimates, dim=-1)
+# fmt: off
+# Width, in STFT frequency bins, of each band used by BandSplit / MaskEstimator.
+# The 62 entries sum to 1025, which equals stft_n_fft // 2 + 1 for the default
+# stft_n_fft of 2048.
+DEFAULT_FREQS_PER_BANDS = (
+    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+    4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
+    12, 12, 12, 12, 12, 12, 12, 12,
+    24, 24, 24, 24, 24, 24, 24, 24,
+    48, 48, 48, 48, 48, 48, 48, 48,
+    128, 129
+)
+# fmt: on
+class BSRoformer_SW(Module):
+    """Band-split transformer that predicts complex STFT masks, one per stem.
+    The input waveform is transformed with an STFT, split into frequency bands,
+    passed through ``depth`` blocks of (optional linear, time, frequency)
+    transformers, and turned into one complex mask per stem. The masked STFT is
+    inverted back to audio. When a ``target`` is supplied, ``forward`` returns an
+    L1 waveform loss plus a weighted multi-resolution STFT L1 loss instead.
+    The rotary cos/sin tables are learned parameters initialised to zeros: the time
+    table has ``chunk_size // stft_hop_length + 1`` rows and the frequency table has
+    ``len(freqs_per_bands)`` rows.
+    NOTE(review): the time table has a fixed number of rows and is broadcast against
+    the attention input, so inputs presumably need to yield exactly that many STFT
+    frames - confirm against the caller.
+    """
+    @beartype
+    def __init__(
+        self,
+        dim,
+        *,
+        depth,
+        stereo=False,
+        num_stems=1,
+        time_transformer_depth=2,
+        freq_transformer_depth=2,
+        linear_transformer_depth=0,
+        freqs_per_bands: tuple[int, ...] = DEFAULT_FREQS_PER_BANDS,
+        dim_head=64,
+        heads=8,
+        attn_dropout=0.0,
+        ff_dropout=0.0,
+        flash_attn=True,
+        stft_n_fft=2048,
+        stft_hop_length=512,
+        stft_win_length=2048,
+        stft_normalized=False,
+        stft_window_fn: Callable | None = None,
+        mask_estimator_depth=2,
+        multi_stft_resolution_loss_weight=1.0,
+        multi_stft_resolutions_window_sizes: tuple[int, ...] = (
+            4096,
+            2048,
+            1024,
+            512,
+            256,
+        ),
+        multi_stft_hop_size=147,
+        multi_stft_normalized=False,
+        multi_stft_window_fn: Callable = torch.hann_window,
+        mlp_expansion_factor=4,
+        use_torch_checkpoint=False,
+        skip_connection=False,
+        use_shared_bias=False,
+        chunk_size: int = 588800,
+        **kwargs
+    ):
+        """Build the model; unrecognised keyword arguments are accepted and ignored.
+        With ``use_shared_bias`` a single QKV bias and a single output bias (both
+        initialised to ones) are shared by every ``Attention`` layer; otherwise the
+        attention projections are created without bias.
+        """
+        super().__init__()
+        self.stereo = stereo
+        self.audio_channels = 2 if stereo else 1
+        self.num_stems = num_stems
+        self.use_torch_checkpoint = use_torch_checkpoint
+        self.skip_connection = skip_connection
+        self.layers = ModuleList([])
+        if use_shared_bias:
+            dim_inner = heads * dim_head
+            self.shared_qkv_bias = nn.Parameter(torch.ones(dim_inner * 3))
+            self.shared_out_bias = nn.Parameter(torch.ones(dim))
+        else:
+            # transformer_kwargs below reads both attributes unconditionally, so they
+            # must exist even when bias sharing is off; otherwise nn.Module raises
+            # AttributeError for the default use_shared_bias=False.
+            self.shared_qkv_bias = None
+            self.shared_out_bias = None
+        transformer_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
+            norm_output=False,
+            shared_qkv_bias=self.shared_qkv_bias,
+            shared_out_bias=self.shared_out_bias,
+        )
+        # Learned rotary tables for the time axis, one row per STFT frame of a chunk.
+        t_frames = chunk_size // stft_hop_length + 1
+        self.cos_emb_time = nn.Parameter(torch.zeros(t_frames, dim_head))
+        self.sin_emb_time = nn.Parameter(torch.zeros(t_frames, dim_head))
+        time_rotary_embed = RotaryEmbedding(
+            cos_emb=self.cos_emb_time, sin_emb=self.sin_emb_time
+        )
+        # Learned rotary tables for the band axis, one row per band.
+        num_bands = len(freqs_per_bands)
+        self.cos_emb_freq = nn.Parameter(torch.zeros(num_bands, dim_head))
+        self.sin_emb_freq = nn.Parameter(torch.zeros(num_bands, dim_head))
+        freq_rotary_embed = RotaryEmbedding(
+            cos_emb=self.cos_emb_freq, sin_emb=self.sin_emb_freq
+        )
+        for _ in range(depth):
+            tran_modules = []
+            # forward() tells the two block layouts apart by their length (3 vs 2).
+            if linear_transformer_depth > 0:
+                tran_modules.append(
+                    Transformer(
+                        depth=linear_transformer_depth,
+                        linear_attn=True,
+                        **transformer_kwargs,
+                    )
+                )
+            tran_modules.append(
+                Transformer(
+                    depth=time_transformer_depth,
+                    rotary_embed=time_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            tran_modules.append(
+                Transformer(
+                    depth=freq_transformer_depth,
+                    rotary_embed=freq_rotary_embed,
+                    **transformer_kwargs,
+                )
+            )
+            self.layers.append(nn.ModuleList(tran_modules))
+        self.final_norm = CustomNorm(dim)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized,
+        )
+        self.stft_window_fn = partial(
+            stft_window_fn or torch.hann_window, stft_win_length
+        )
+        # Each band carries real+imag (x2) for every audio channel.
+        freqs_per_bands_with_complex = tuple(
+            2 * f * self.audio_channels for f in freqs_per_bands
+        )
+        self.band_split = BandSplit(dim=dim, dim_inputs=freqs_per_bands_with_complex)
+        self.mask_estimators = nn.ModuleList([])
+        for _ in range(num_stems):
+            mask_estimator = MaskEstimator(
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
+                depth=mask_estimator_depth,
+                mlp_expansion_factor=mlp_expansion_factor,
+            )
+            self.mask_estimators.append(mask_estimator)
+        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
+        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
+        self.multi_stft_n_fft = stft_n_fft
+        self.multi_stft_window_fn = multi_stft_window_fn
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size, normalized=multi_stft_normalized
+        )
+    def forward(self, raw_audio, target=None, return_loss_breakdown=False):
+        """Separate ``raw_audio`` into stems, or compute the training loss.
+        Args:
+            raw_audio: waveform shaped ``(b, t)`` or ``(b, channels, t)``; the channel
+                count must be 2 when ``stereo`` is set and 1 otherwise.
+            target: optional reference audio; when given, a loss is returned.
+            return_loss_breakdown: also return the individual loss terms.
+        Returns:
+            Without ``target``: the reconstructed audio, ``(b, s, t)`` for a single
+            stem and ``(b, n, s, t)`` otherwise. With ``target``: ``total_loss``, or
+            ``(total_loss, (loss, multi_stft_resolution_loss))`` when
+            ``return_loss_breakdown`` is true.
+        Raises:
+            AssertionError: channel count does not match ``stereo``, or ``target``
+                does not have the multi-stem layout.
+            RuntimeError: NaN/Inf found after the STFT or after the band split.
+        """
+        device = raw_audio.device
+        x_is_mps = device.type == "mps"
+        if raw_audio.ndim == 2:
+            raw_audio = rearrange(raw_audio, "b t -> b 1 t")
+        channels = raw_audio.shape[1]
+        assert (not self.stereo and channels == 1) or (
+            self.stereo and channels == 2
+        ), "stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)"
+        # Fold batch and channel together so the STFT sees a 2-D (signals, time) input.
+        raw_audio, batch_audio_channel_packed_shape = pack([raw_audio], "* t")
+        stft_window = self.stft_window_fn(device=device)
+        try:
+            stft_repr = torch.stft(
+                raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True
+            )
+        except Exception:
+            # Retry once; on MPS the retry runs on CPU and the result is moved back.
+            stft_repr = torch.stft(
+                raw_audio.cpu() if x_is_mps else raw_audio,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=True,
+            ).to(device)
+        stft_repr = torch.view_as_real(stft_repr)
+        stft_repr = unpack(stft_repr, batch_audio_channel_packed_shape, "* f t c")[0]
+        # Interleave channels inside the frequency axis.
+        stft_repr = rearrange(stft_repr, "b s f t c -> b (f s) t c")
+        x = rearrange(stft_repr, "b f t c -> b t (f c)")
+        if torch.isnan(x).any() or torch.isinf(x).any():
+            raise RuntimeError(
+                f"NaN/Inf in x after stft: {x.isnan().sum()} NaNs, {x.isinf().sum()} Infs"
+            )
+        if self.use_torch_checkpoint:
+            x = checkpoint(self.band_split, x, use_reentrant=False)
+        else:
+            x = self.band_split(x)
+        if torch.isnan(x).any() or torch.isinf(x).any():
+            raise RuntimeError(
+                f"NaN/Inf in x after band_split: {x.isnan().sum()} NaNs, {x.isinf().sum()} Infs"
+            )
+        # Per-block outputs, filled only when skip connections are enabled.
+        store = [None] * len(self.layers)
+        for i, transformer_block in enumerate(self.layers):
+            if len(transformer_block) == 3:
+                linear_transformer, time_transformer, freq_transformer = (
+                    transformer_block
+                )
+                # The linear transformer sees time and band flattened into one axis.
+                x, ft_ps = pack([x], "b * d")
+                if self.use_torch_checkpoint:
+                    x = checkpoint(linear_transformer, x, use_reentrant=False)
+                else:
+                    x = linear_transformer(x)
+                (x,) = unpack(x, ft_ps, "b * d")
+            else:
+                time_transformer, freq_transformer = transformer_block
+            if self.skip_connection:
+                # Add the outputs of every earlier block.
+                for j in range(i):
+                    x = x + store[j]
+            # Attend along time: bands are folded into the batch axis.
+            x = rearrange(x, "b t f d -> b f t d")
+            x, ps = pack([x], "* t d")
+            if self.use_torch_checkpoint:
+                x = checkpoint(time_transformer, x, use_reentrant=False)
+            else:
+                x = time_transformer(x)
+            (x,) = unpack(x, ps, "* t d")
+            # Attend along bands: frames are folded into the batch axis.
+            x = rearrange(x, "b f t d -> b t f d")
+            x, ps = pack([x], "* f d")
+            if self.use_torch_checkpoint:
+                x = checkpoint(freq_transformer, x, use_reentrant=False)
+            else:
+                x = freq_transformer(x)
+            (x,) = unpack(x, ps, "* f d")
+            if self.skip_connection:
+                store[i] = x
+        x = self.final_norm(x)
+        num_stems = len(self.mask_estimators)
+        if self.use_torch_checkpoint:
+            mask = torch.stack(
+                [checkpoint(fn, x, use_reentrant=False) for fn in self.mask_estimators],
+                dim=1,
+            )
+        else:
+            mask = torch.stack([fn(x) for fn in self.mask_estimators], dim=1)
+        mask = rearrange(mask, "b n t (f c) -> b n f t c", c=2)
+        # Add a stem axis of size 1 so the STFT broadcasts against every stem mask.
+        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
+        stft_repr = torch.view_as_complex(stft_repr)
+        mask = torch.view_as_complex(mask)
+        stft_repr = stft_repr * mask
+        stft_repr = rearrange(
+            stft_repr, "b n (f s) t -> (b n s) f t", s=self.audio_channels
+        )
+        try:
+            recon_audio = torch.istft(
+                stft_repr,
+                **self.stft_kwargs,
+                window=stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            )
+        except Exception:
+            # Same retry policy as the forward STFT above.
+            recon_audio = torch.istft(
+                stft_repr.cpu() if x_is_mps else stft_repr,
+                **self.stft_kwargs,
+                window=stft_window.cpu() if x_is_mps else stft_window,
+                return_complex=False,
+                length=raw_audio.shape[-1],
+            ).to(device)
+        recon_audio = rearrange(
+            recon_audio, "(b n s) t -> b n s t", s=self.audio_channels, n=num_stems
+        )
+        if num_stems == 1:
+            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
+        if target is None:
+            return recon_audio
+        if self.num_stems > 1:
+            assert target.ndim == 4 and target.shape[1] == self.num_stems
+        if target.ndim == 2:
+            target = rearrange(target, "... t -> ... 1 t")
+        # Trim the target to the reconstructed length before comparing.
+        target = target[..., : recon_audio.shape[-1]]
+        loss = F.l1_loss(recon_audio, target)
+        multi_stft_resolution_loss = 0.0
+        for window_size in self.multi_stft_resolutions_window_sizes:
+            res_stft_kwargs = dict(
+                n_fft=max(window_size, self.multi_stft_n_fft),
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
+            )
+            recon_Y = torch.stft(
+                rearrange(recon_audio, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            target_Y = torch.stft(
+                rearrange(target, "... s t -> (... s) t"), **res_stft_kwargs
+            )
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(
+                recon_Y, target_Y
+            )
+        weighted_multi_resolution_loss = (
+            multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        )
+        total_loss = loss + weighted_multi_resolution_loss
+        if not return_loss_breakdown:
+            return total_loss
+        return total_loss, (loss, multi_stft_resolution_loss)

models/bs_roformer/bs_roformer_unwa_inst_large_2.py CHANGED Viewed

@@ -6,10 +6,7 @@ from torch.nn import Module, ModuleList
 import torch.nn.functional as F
 from .attend import Attend
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
@@ -85,7 +82,6 @@ class Attention(Module):
             dropout=0.,
             rotary_embed=None,
             flash=True,
-            sage_attention=False,
     ):
         super().__init__()
         self.heads = heads
@@ -94,10 +90,7 @@ class Attention(Module):
         self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
@@ -142,7 +135,6 @@ class LinearAttention(Module):
             scale=8,
             flash=False,
             dropout=0.,
-            sage_attention=False,
     ):
         super().__init__()
         dim_inner = dim_head * heads
@@ -155,18 +147,11 @@ class LinearAttention(Module):
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(
-                scale=scale,
-                dropout=dropout,
-                flash=flash
-            )
-        else:
-            self.attend = Attend(
-                scale=scale,
-                dropout=dropout,
-                flash=flash
-            )
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
@@ -203,7 +188,6 @@ class Transformer(Module):
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
-            sage_attention=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
@@ -216,7 +200,6 @@ class Transformer(Module):
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             else:
                 attn = Attention(
@@ -226,7 +209,6 @@ class Transformer(Module):
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             self.layers.append(ModuleList([
@@ -342,7 +324,6 @@ class MaskEstimator(Module):
             ff_dropout=0.,
             flash_attn=True,
             norm_output=False,
-            sage_attention=False,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)
@@ -445,7 +426,7 @@ class BSRoformer_2(Module):
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
-            sage_attention=False,
     ):
         super().__init__()
@@ -457,9 +438,6 @@ class BSRoformer_2(Module):
         self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
@@ -468,7 +446,6 @@ class BSRoformer_2(Module):
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
-            sage_attention=sage_attention,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)

 import torch.nn.functional as F
 from .attend import Attend
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
             dropout=0.,
             rotary_embed=None,
             flash=True,
     ):
         super().__init__()
         self.heads = heads
         self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
             scale=8,
             flash=False,
             dropout=0.,
     ):
         super().__init__()
         dim_inner = dim_head * heads
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(
+            scale=scale,
+            dropout=dropout,
+            flash=flash
+        )
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
                 )
             else:
                 attn = Attention(
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
                 )
             self.layers.append(ModuleList([
             ff_dropout=0.,
             flash_attn=True,
             norm_output=False,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
+            **kwargs
     ):
         super().__init__()
         self.layers = ModuleList([])
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)

models/bs_roformer/bs_siamese_roformer.py CHANGED Viewed

@@ -6,10 +6,6 @@ from torch.nn import Module, ModuleList
 import torch.nn.functional as F
 from .attend import Attend
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
@@ -86,7 +82,6 @@ class Attention(Module):
             dropout=0.,
             rotary_embed=None,
             flash=True,
-            sage_attention=False,
     ):
         super().__init__()
         self.heads = heads
@@ -95,10 +90,7 @@ class Attention(Module):
         self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
@@ -143,7 +135,6 @@ class LinearAttention(Module):
             scale=8,
             flash=False,
             dropout=0.,
-            sage_attention=False,
     ):
         super().__init__()
         dim_inner = dim_head * heads
@@ -156,18 +147,11 @@ class LinearAttention(Module):
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(
-                scale=scale,
-                dropout=dropout,
-                flash=flash
-            )
-        else:
-            self.attend = Attend(
-                scale=scale,
-                dropout=dropout,
-                flash=flash
-            )
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
@@ -205,7 +189,6 @@ class SiameseTransformer(Module):
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
-            sage_attention=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
@@ -223,7 +206,6 @@ class SiameseTransformer(Module):
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             else:
                 attn = Attention(
@@ -233,7 +215,6 @@ class SiameseTransformer(Module):
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             self.layers.append(ModuleList([
@@ -415,7 +396,7 @@ class BSSiameseRoformer(Module):
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
-            sage_attention=False,
     ):
         super().__init__()
@@ -427,9 +408,6 @@ class BSSiameseRoformer(Module):
         self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
@@ -438,7 +416,6 @@ class BSSiameseRoformer(Module):
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
-            sage_attention=sage_attention,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)

 import torch.nn.functional as F
 from .attend import Attend
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
             dropout=0.,
             rotary_embed=None,
             flash=True,
     ):
         super().__init__()
         self.heads = heads
         self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
             scale=8,
             flash=False,
             dropout=0.,
     ):
         super().__init__()
         dim_inner = dim_head * heads
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(
+            scale=scale,
+            dropout=dropout,
+            flash=flash
+        )
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
             rotary_embed=None,
             flash_attn=True,
             linear_attn=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
                 )
             else:
                 attn = Attention(
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
                 )
             self.layers.append(ModuleList([
             mlp_expansion_factor=4,
             use_torch_checkpoint=False,
             skip_connection=False,
+            **kwargs
     ):
         super().__init__()
         self.layers = ModuleList([])
         transformer_kwargs = dict(
             dim=dim,
             heads=heads,
             ff_dropout=ff_dropout,
             flash_attn=flash_attn,
             norm_output=False,
         )
         time_rotary_embed = RotaryEmbedding(dim=dim_head)

models/bs_roformer/fno1d.py CHANGED Viewed

The diff for this file is too large to render. See raw diff

models/bs_roformer/mel_band_conformer.py CHANGED Viewed

@@ -6,10 +6,6 @@ from torch.nn import Module, ModuleList
 import torch.nn.functional as F
 from .attend import Attend
-try:
-    from .attend_sage import Attend as AttendSage
-except:
-    pass
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
@@ -97,7 +93,6 @@ class Attention(Module):
         dropout=0.,
         rotary_embed=None,
         flash=True,
-        sage_attention=False,
     ):
         super().__init__()
         self.heads = heads
@@ -106,10 +101,7 @@ class Attention(Module):
         self.rotary_embed = rotary_embed
-        if sage_attention:
-            self.attend = AttendSage(flash=flash, dropout=dropout)
-        else:
-            self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
@@ -153,7 +145,6 @@ class LinearAttention(Module):
         scale=8,
         flash=True,
         dropout=0.,
-        sage_attention=False
     ):
         super().__init__()
         dim_inner = dim_head * heads
@@ -166,10 +157,7 @@ class LinearAttention(Module):
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        if sage_attention:
-            self.attend = AttendSage(scale=scale, dropout=dropout, flash=flash)
-        else:
-            self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
@@ -202,7 +190,6 @@ class Transformer(Module):
         rotary_embed=None,
         flash_attn=True,
         linear_attn=False,
-        sage_attention=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
@@ -215,7 +202,6 @@ class Transformer(Module):
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             else:
                 attn = Attention(
@@ -225,7 +211,6 @@ class Transformer(Module):
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
-                    sage_attention=sage_attention
                 )
             self.layers.append(ModuleList([
@@ -290,7 +275,6 @@ class ConformerBlock(nn.Module):
         conv_kernel_size=31,
         rotary_embed=None,
         flash_attn=True,
-        sage_attention=False
     ):
         super().__init__()
         self.ff1 = MacaronFF(dim=dim, mult=ff_mult, dropout=ff_dropout)
@@ -301,7 +285,6 @@ class ConformerBlock(nn.Module):
             dropout=attn_dropout,
             rotary_embed=rotary_embed,
             flash=flash_attn,
-            sage_attention=sage_attention
         )
         self.conv = ConformerConvModule(
             dim=dim,
@@ -333,7 +316,6 @@ class Conformer(Module):
         ff_mult=4,
         rotary_embed=None,
         flash_attn=True,
-        sage_attention=False,
         conv_expansion_factor=2,
         conv_kernel_size=31,
         norm_output=True
@@ -351,7 +333,6 @@ class Conformer(Module):
                 conv_kernel_size=conv_kernel_size,
                 rotary_embed=rotary_embed,
                 flash_attn=flash_attn,
-                sage_attention=sage_attention
             ) for _ in range(depth)
         ])
         self.norm = RMSNorm(dim) if norm_output else nn.Identity()
@@ -466,7 +447,6 @@ class MelBandConformer(Module):
         mlp_expansion_factor=4,
         use_torch_checkpoint=False,
         skip_connection=False,
-        sage_attention=False,
         # conformer-specific
         ff_mult=4,
         conv_expansion_factor=2,
@@ -482,9 +462,6 @@ class MelBandConformer(Module):
         self.layers = ModuleList([])
-        if sage_attention:
-            print("Use Sage Attention")
         transformer_kwargs = dict(
             dim = dim,
             heads = heads,
@@ -492,7 +469,6 @@ class MelBandConformer(Module):
             attn_dropout = attn_dropout,
             ff_dropout = ff_dropout,
             flash_attn = flash_attn,
-            sage_attention = sage_attention,
             norm_output = False
         )

 import torch.nn.functional as F
 from .attend import Attend
 from torch.utils.checkpoint import checkpoint
 from beartype.typing import Tuple, Optional, List, Callable
         dropout=0.,
         rotary_embed=None,
         flash=True,
     ):
         super().__init__()
         self.heads = heads
         self.rotary_embed = rotary_embed
+        self.attend = Attend(flash=flash, dropout=dropout)
         self.norm = RMSNorm(dim)
         self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
         scale=8,
         flash=True,
         dropout=0.,
     ):
         super().__init__()
         dim_inner = dim_head * heads
         self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(scale=scale, dropout=dropout, flash=flash)
         self.to_out = nn.Sequential(
             Rearrange('b h d n -> b n (h d)'),
         rotary_embed=None,
         flash_attn=True,
         linear_attn=False,
     ):
         super().__init__()
         self.layers = ModuleList([])
                     heads=heads,
                     dropout=attn_dropout,
                     flash=flash_attn,
                 )
             else:
                 attn = Attention(
                     dropout=attn_dropout,
                     rotary_embed=rotary_embed,
                     flash=flash_attn,
                 )
             self.layers.append(ModuleList([
         conv_kernel_size=31,
         rotary_embed=None,
         flash_attn=True,
     ):
         super().__init__()
         self.ff1 = MacaronFF(dim=dim, mult=ff_mult, dropout=ff_dropout)
             dropout=attn_dropout,
             rotary_embed=rotary_embed,
             flash=flash_attn,
         )
         self.conv = ConformerConvModule(
             dim=dim,
         ff_mult=4,
         rotary_embed=None,
         flash_attn=True,
         conv_expansion_factor=2,
         conv_kernel_size=31,
         norm_output=True
                 conv_kernel_size=conv_kernel_size,
                 rotary_embed=rotary_embed,
                 flash_attn=flash_attn,
             ) for _ in range(depth)
         ])
         self.norm = RMSNorm(dim) if norm_output else nn.Identity()
         mlp_expansion_factor=4,
         use_torch_checkpoint=False,
         skip_connection=False,
         # conformer-specific
         ff_mult=4,
         conv_expansion_factor=2,
         self.layers = ModuleList([])
         transformer_kwargs = dict(
             dim = dim,
             heads = heads,
             attn_dropout = attn_dropout,
             ff_dropout = ff_dropout,
             flash_attn = flash_attn,
             norm_output = False
         )

models/bs_roformer/mel_band_roformer.py CHANGED Viewed

@@ -1,748 +1,749 @@
-from functools import partial
-import torch
-from torch import nn, einsum, tensor, Tensor
-from torch.nn import Module, ModuleList
-import torch.nn.functional as F
-from .attend import Attend
-from torch.utils.checkpoint import checkpoint
-from beartype.typing import Tuple, Optional, List, Callable
-from beartype import beartype
-from rotary_embedding_torch import RotaryEmbedding
-from einops import rearrange, pack, unpack, reduce, repeat
-from einops.layers.torch import Rearrange
-from librosa import filters
-try:
-    from .pope.attention import flash_attn_with_pope
-    from .pope.pope import PoPE
-    _HAS_POPE = True
-except Exception:
-    PoPE = None
-    flash_attn_with_pope = None
-    _HAS_POPE = False
-# helper functions
-def exists(val):
-    return val is not None
-def default(v, d):
-    return v if exists(v) else d
-def pack_one(t, pattern):
-    return pack([t], pattern)
-def unpack_one(t, ps, pattern):
-    return unpack(t, ps, pattern)[0]
-def pad_at_dim(t, pad, dim=-1, value=0.):
-    dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
-    zeros = ((0, 0) * dims_from_right)
-    return F.pad(t, (*zeros, *pad), value=value)
-def l2norm(t):
-    return F.normalize(t, dim=-1, p=2)
-# norm
-class RMSNorm(Module):
-    def __init__(self, dim):
-        super().__init__()
-        self.scale = dim ** 0.5
-        self.gamma = nn.Parameter(torch.ones(dim))
-    def forward(self, x):
-        return F.normalize(x, dim=-1) * self.scale * self.gamma
-# attention
-class FeedForward(Module):
-    def __init__(
-            self,
-            dim,
-            mult=4,
-            dropout=0.
-    ):
-        super().__init__()
-        dim_inner = int(dim * mult)
-        self.net = nn.Sequential(
-            RMSNorm(dim),
-            nn.Linear(dim, dim_inner),
-            nn.GELU(),
-            nn.Dropout(dropout),
-            nn.Linear(dim_inner, dim),
-            nn.Dropout(dropout)
-        )
-    def forward(self, x):
-        return self.net(x)
-class Attention(Module):
-    def __init__(
-            self,
-            dim,
-            heads=8,
-            dim_head=64,
-            dropout=0.,
-            rotary_embed=None,
-            flash=True,
-            pope_embed=None,
-    ):
-        super().__init__()
-        self.heads = heads
-        self.scale = dim_head ** -0.5
-        dim_inner = heads * dim_head
-        self.rotary_embed = rotary_embed
-        self.pope_embed = pope_embed
-        assert not (self.rotary_embed is not None and self.pope_embed is not None), \
-            "cannot have both rotary and pope embeddings"
-        self.attend = Attend(flash=flash, dropout=dropout)
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
-        self.to_gates = nn.Linear(dim, heads)
-        self.to_out = nn.Sequential(
-            nn.Linear(dim_inner, dim, bias=False),
-            nn.Dropout(dropout)
-        )
-    def forward(self, x):
-        x = self.norm(x)
-        q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads)
-        if exists(self.pope_embed):
-            assert _HAS_POPE, "PoPE requested but PoPE_pytorch is not installed"
-            out = flash_attn_with_pope(
-                q, k, v,
-                pos_emb=self.pope_embed(q.shape[-2]),
-                softmax_scale=self.scale
-            )
-        elif exists(self.rotary_embed):
-            q = self.rotary_embed.rotate_queries_or_keys(q)
-            k = self.rotary_embed.rotate_queries_or_keys(k)
-            out = self.attend(q, k, v)
-        else:
-            out = self.attend(q, k, v)
-        gates = self.to_gates(x)
-        out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid()
-        out = rearrange(out, 'b h n d -> b n (h d)')
-        return self.to_out(out)
-class LinearAttention(Module):
-    """
-    this flavor of linear attention proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al.
-    """
-    @beartype
-    def __init__(
-            self,
-            *,
-            dim,
-            dim_head=32,
-            heads=8,
-            scale=8,
-            flash=False,
-            dropout=0.
-    ):
-        super().__init__()
-        dim_inner = dim_head * heads
-        self.norm = RMSNorm(dim)
-        self.to_qkv = nn.Sequential(
-            nn.Linear(dim, dim_inner * 3, bias=False),
-            Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads)
-        )
-        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
-        self.attend = Attend(
-            scale=scale,
-            dropout=dropout,
-            flash=flash
-        )
-        self.to_out = nn.Sequential(
-            Rearrange('b h d n -> b n (h d)'),
-            nn.Linear(dim_inner, dim, bias=False)
-        )
-    def forward(
-            self,
-            x
-    ):
-        x = self.norm(x)
-        q, k, v = self.to_qkv(x)
-        q, k = map(l2norm, (q, k))
-        q = q * self.temperature.exp()
-        out = self.attend(q, k, v)
-        return self.to_out(out)
-class Transformer(Module):
-    def __init__(
-            self,
-            *,
-            dim,
-            depth,
-            dim_head=64,
-            heads=8,
-            attn_dropout=0.,
-            ff_dropout=0.,
-            ff_mult=4,
-            norm_output=True,
-            rotary_embed=None,
-            pope_embed=None,
-            flash_attn=True,
-            linear_attn=False,
-    ):
-        super().__init__()
-        self.layers = ModuleList([])
-        for _ in range(depth):
-            if linear_attn:
-                attn = LinearAttention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    flash=flash_attn
-                )
-            else:
-                attn = Attention(
-                    dim=dim,
-                    dim_head=dim_head,
-                    heads=heads,
-                    dropout=attn_dropout,
-                    rotary_embed=rotary_embed,
-                    pope_embed=pope_embed,
-                    flash=flash_attn
-                )
-            self.layers.append(ModuleList([
-                attn,
-                FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
-            ]))
-        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
-    def forward(self, x):
-        for attn, ff in self.layers:
-            x = attn(x) + x
-            x = ff(x) + x
-        return self.norm(x)
-# bandsplit module
-class BandSplit(Module):
-    @beartype
-    def __init__(
-            self,
-            dim,
-            dim_inputs: Tuple[int, ...]
-    ):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_features = ModuleList([])
-        for dim_in in dim_inputs:
-            net = nn.Sequential(
-                RMSNorm(dim_in),
-                nn.Linear(dim_in, dim)
-            )
-            self.to_features.append(net)
-    def forward(self, x):
-        x = x.split(self.dim_inputs, dim=-1)
-        outs = []
-        for split_input, to_feature in zip(x, self.to_features):
-            split_output = to_feature(split_input)
-            outs.append(split_output)
-        return torch.stack(outs, dim=-2)
-def MLP(
-        dim_in,
-        dim_out,
-        dim_hidden=None,
-        depth=1,
-        activation=nn.Tanh
-):
-    dim_hidden = default(dim_hidden, dim_in)
-    net = []
-    dims = (dim_in, *((dim_hidden,) * depth), dim_out)
-    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
-        is_last = ind == (len(dims) - 2)
-        net.append(nn.Linear(layer_dim_in, layer_dim_out))
-        if is_last:
-            continue
-        net.append(activation())
-    return nn.Sequential(*net)
-class MaskEstimator(Module):
-    @beartype
-    def __init__(
-            self,
-            dim,
-            dim_inputs: Tuple[int, ...],
-            depth,
-            mlp_expansion_factor=4
-    ):
-        super().__init__()
-        self.dim_inputs = dim_inputs
-        self.to_freqs = ModuleList([])
-        dim_hidden = dim * mlp_expansion_factor
-        for dim_in in dim_inputs:
-            net = []
-            mlp = nn.Sequential(
-                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth),
-                nn.GLU(dim=-1)
-            )
-            self.to_freqs.append(mlp)
-    def forward(self, x):
-        x = x.unbind(dim=-2)
-        outs = []
-        for band_features, mlp in zip(x, self.to_freqs):
-            freq_out = mlp(band_features)
-            outs.append(freq_out)
-        return torch.cat(outs, dim=-1)
-# main class
-class MelBandRoformer(Module):
-    @beartype
-    def __init__(
-            self,
-            dim,
-            *,
-            depth,
-            stereo=False,
-            num_stems=1,
-            time_transformer_depth=2,
-            freq_transformer_depth=2,
-            linear_transformer_depth=0,
-            num_bands=60,
-            dim_head=64,
-            heads=8,
-            attn_dropout=0.1,
-            ff_dropout=0.1,
-            flash_attn=True,
-            dim_freqs_in=1025,
-            sample_rate=44100,  # needed for mel filter bank from librosa
-            stft_n_fft=2048,
-            stft_hop_length=512,
-            # 10ms at 44100Hz, from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
-            stft_win_length=2048,
-            stft_normalized=False,
-            stft_window_fn: Optional[Callable] = None,
-            zero_dc = True,
-            mask_estimator_depth=1,
-            multi_stft_resolution_loss_weight=1.,
-            multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
-            multi_stft_hop_size=147,
-            multi_stft_normalized=False,
-            multi_stft_window_fn: Callable = torch.hann_window,
-            match_input_audio_length=False,  # if True, pad output tensor to match length of input tensor
-            mlp_expansion_factor=4,
-            use_torch_checkpoint=False,
-            skip_connection=False,
-            use_pope: bool = False,
-    ):
-        super().__init__()
-        self.stereo = stereo
-        self.audio_channels = 2 if stereo else 1
-        self.num_stems = num_stems
-        self.use_torch_checkpoint = use_torch_checkpoint
-        self.skip_connection = skip_connection
-        self.layers = ModuleList([])
-        transformer_kwargs = dict(
-            dim=dim,
-            heads=heads,
-            dim_head=dim_head,
-            attn_dropout=attn_dropout,
-            ff_dropout=ff_dropout,
-            flash_attn=flash_attn,
-        )
-        if use_pope:
-            assert _HAS_POPE, "PoPE requested but PoPE_pytorch is not installed"
-            time_pope_embed = PoPE(dim=dim_head, heads=heads)
-            freq_pope_embed = PoPE(dim=dim_head, heads=heads)
-            time_rotary_embed = None
-            freq_rotary_embed = None
-        else:
-            time_rotary_embed = RotaryEmbedding(dim=dim_head)
-            freq_rotary_embed = RotaryEmbedding(dim=dim_head)
-            time_pope_embed = freq_pope_embed = None
-        for _ in range(depth):
-            tran_modules = []
-            if linear_transformer_depth > 0:
-                tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, **transformer_kwargs))
-            tran_modules.append(
-                Transformer(
-                    depth=time_transformer_depth,
-                    rotary_embed=time_rotary_embed,
-                    pope_embed=time_pope_embed,
-                    **transformer_kwargs
-                )
-            )
-            tran_modules.append(
-                Transformer(
-                    depth=freq_transformer_depth,
-                    rotary_embed=freq_rotary_embed,
-                    pope_embed=freq_pope_embed,
-                    **transformer_kwargs
-                )
-            )
-            self.layers.append(nn.ModuleList(tran_modules))
-        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)
-        self.stft_kwargs = dict(
-            n_fft=stft_n_fft,
-            hop_length=stft_hop_length,
-            win_length=stft_win_length,
-            normalized=stft_normalized
-        )
-        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_n_fft), return_complex=True).shape[1]
-        # create mel filter bank
-        # with librosa.filters.mel as in section 2 of paper
-        mel_filter_bank_numpy = filters.mel(sr=sample_rate, n_fft=stft_n_fft, n_mels=num_bands)
-        mel_filter_bank = torch.from_numpy(mel_filter_bank_numpy)
-        # for some reason, it doesn't include the first freq? just force a value for now
-        mel_filter_bank[0][0] = 1.
-        # In some systems/envs we get 0.0 instead of ~1.9e-18 in the last position,
-        # so let's force a positive value
-        mel_filter_bank[-1, -1] = 1.
-        # binary as in paper (then estimated masks are averaged for overlapping regions)
-        freqs_per_band = mel_filter_bank > 0
-        assert freqs_per_band.any(dim=0).all(), 'all frequencies need to be covered by all bands for now'
-        repeated_freq_indices = repeat(torch.arange(freqs), 'f -> b f', b=num_bands)
-        freq_indices = repeated_freq_indices[freqs_per_band]
-        if stereo:
-            freq_indices = repeat(freq_indices, 'f -> f s', s=2)
-            freq_indices = freq_indices * 2 + torch.arange(2)
-            freq_indices = rearrange(freq_indices, 'f s -> (f s)')
-        self.register_buffer('freq_indices', freq_indices, persistent=False)
-        self.register_buffer('freqs_per_band', freqs_per_band, persistent=False)
-        num_freqs_per_band = reduce(freqs_per_band, 'b f -> b', 'sum')
-        num_bands_per_freq = reduce(freqs_per_band, 'b f -> f', 'sum')
-        self.register_buffer('num_freqs_per_band', num_freqs_per_band, persistent=False)
-        self.register_buffer('num_bands_per_freq', num_bands_per_freq, persistent=False)
-        # band split and mask estimator
-        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in num_freqs_per_band.tolist())
-        self.band_split = BandSplit(
-            dim=dim,
-            dim_inputs=freqs_per_bands_with_complex
-        )
-        self.mask_estimators = nn.ModuleList([])
-        for _ in range(num_stems):
-            mask_estimator = MaskEstimator(
-                dim=dim,
-                dim_inputs=freqs_per_bands_with_complex,
-                depth=mask_estimator_depth,
-                mlp_expansion_factor=mlp_expansion_factor,
-            )
-            self.mask_estimators.append(mask_estimator)
-        # whether to zero out dc
-        self.zero_dc = zero_dc
-        # for the multi-resolution stft loss
-        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
-        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
-        self.multi_stft_n_fft = stft_n_fft
-        self.multi_stft_window_fn = multi_stft_window_fn
-        self.multi_stft_kwargs = dict(
-            hop_length=multi_stft_hop_size,
-            normalized=multi_stft_normalized
-        )
-        self.match_input_audio_length = match_input_audio_length
-    def forward(
-            self,
-            raw_audio,
-            target=None,
-            active_stem_ids=None,
-            return_loss_breakdown=False
-    ):
-        """
-        einops
-        b - batch
-        f - freq
-        t - time
-        s - audio channel (1 for mono, 2 for stereo)
-        n - number of 'stems'
-        c - complex (2)
-        d - feature dimension
-        """
-        device = raw_audio.device
-        if raw_audio.ndim == 2:
-            raw_audio = rearrange(raw_audio, 'b t -> b 1 t')
-        batch, channels, raw_audio_length = raw_audio.shape
-        istft_length = raw_audio_length if self.match_input_audio_length else None
-        assert (not self.stereo and channels == 1) or (
-                    self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)'
-        # to stft
-        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t')
-        stft_window = self.stft_window_fn(device=device)
-        stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True)
-        stft_repr = torch.view_as_real(stft_repr)
-        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c')
-        # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
-        stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c')
-        # index out all frequencies for all frequency ranges across bands ascending in one go
-        batch_arange = torch.arange(batch, device=device)[..., None]
-        # account for stereo
-        x = stft_repr[batch_arange, self.freq_indices]
-        # fold the complex (real and imag) into the frequencies dimension
-        x = rearrange(x, 'b f t c -> b t (f c)')
-        if self.use_torch_checkpoint:
-            x = checkpoint(self.band_split, x, use_reentrant=False)
-        else:
-            x = self.band_split(x)
-        # axial / hierarchical attention
-        store = [None] * len(self.layers)
-        for i, transformer_block in enumerate(self.layers):
-            if len(transformer_block) == 3:
-                linear_transformer, time_transformer, freq_transformer = transformer_block
-                x, ft_ps = pack([x], 'b * d')
-                if self.use_torch_checkpoint:
-                    x = checkpoint(linear_transformer, x, use_reentrant=False)
-                else:
-                    x = linear_transformer(x)
-                x, = unpack(x, ft_ps, 'b * d')
-            else:
-                time_transformer, freq_transformer = transformer_block
-            if self.skip_connection:
-                # Sum all previous
-                for j in range(i):
-                    x = x + store[j]
-            x = rearrange(x, 'b t f d -> b f t d')
-            x, ps = pack([x], '* t d')
-            if self.use_torch_checkpoint:
-                x = checkpoint(time_transformer, x, use_reentrant=False)
-            else:
-                x = time_transformer(x)
-            x, = unpack(x, ps, '* t d')
-            x = rearrange(x, 'b f t d -> b t f d')
-            x, ps = pack([x], '* f d')
-            if self.use_torch_checkpoint:
-                x = checkpoint(freq_transformer, x, use_reentrant=False)
-            else:
-                x = freq_transformer(x)
-            x, = unpack(x, ps, '* f d')
-            if self.skip_connection:
-                store[i] = x
-        if active_stem_ids is None:
-            heads = self.mask_estimators
-            stem_ids = list(range(len(self.mask_estimators)))
-        else:
-            heads = [self.mask_estimators[i] for i in active_stem_ids]
-            stem_ids = active_stem_ids
-        num_stems = len(heads)
-        if self.use_torch_checkpoint:
-            masks = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in heads], dim=1)
-        else:
-            masks = torch.stack([fn(x) for fn in heads], dim=1)
-        masks = rearrange(masks, 'b n t (f c) -> b n f t c', c=2)
-        # modulate frequency representation
-        stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c')
-        # complex number multiplication
-        stft_repr = torch.view_as_complex(stft_repr)
-        masks = torch.view_as_complex(masks)
-        masks = masks.type(stft_repr.dtype)
-        # need to average the estimated mask for the overlapped frequencies
-        scatter_indices = repeat(self.freq_indices, 'f -> b n f t', b=batch, n=num_stems, t=stft_repr.shape[-1])
-        stft_repr_expanded_stems = repeat(stft_repr, 'b 1 ... -> b n ...', n=num_stems)
-        masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(2, scatter_indices, masks)
-        denom = repeat(self.num_bands_per_freq, 'f -> (f r) 1', r=channels)
-        masks_averaged = masks_summed / denom.clamp(min=1e-8)
-        # modulate stft repr with estimated mask
-        stft_repr = stft_repr * masks_averaged
-        # istft
-        stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels)
-        if self.zero_dc:
-            # whether to dc filter
-            stft_repr = stft_repr.index_fill(1, tensor(0, device = device), 0.)
-        recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False,
-                                  length=istft_length)
-        recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', b=batch, s=self.audio_channels, n=num_stems)
-        if num_stems == 1:
-            recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t')
-        # if a target is passed in, calculate loss for learning
-        if not exists(target):
-            return recon_audio
-        if self.num_stems > 1:
-            assert target.ndim == 4 and target.shape[1] == self.num_stems
-        if target.ndim == 2:
-            target = rearrange(target, '... t -> ... 1 t')
-        target = target[..., :recon_audio.shape[-1]]  # protect against lost length on istft
-        target_sel = target[:, stem_ids]
-        loss = F.l1_loss(recon_audio, target_sel)
-        multi_stft_resolution_loss = 0.
-        for window_size in self.multi_stft_resolutions_window_sizes:
-            res_stft_kwargs = dict(
-                n_fft=max(window_size, self.multi_stft_n_fft),  # not sure what n_fft is across multi resolution stft
-                win_length=window_size,
-                return_complex=True,
-                window=self.multi_stft_window_fn(window_size, device=device),
-                **self.multi_stft_kwargs,
-            )
-            recon_Y = torch.stft(
-                rearrange(recon_audio, 'b n s t -> (b n s) t'),
-                **res_stft_kwargs
-            )
-            target_Y = torch.stft(
-                rearrange(target_sel, 'b n s t -> (b n s) t'),
-                **res_stft_kwargs
-            )
-            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)
-        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
-        total_loss = loss + weighted_multi_resolution_loss
-        if not return_loss_breakdown:
-            return total_loss
         return total_loss, (loss, multi_stft_resolution_loss)

+from functools import partial
+import torch
+from torch import nn, einsum, tensor, Tensor
+from torch.nn import Module, ModuleList
+import torch.nn.functional as F
+from .attend import Attend
+from torch.utils.checkpoint import checkpoint
+from beartype.typing import Tuple, Optional, List, Callable
+from beartype import beartype
+from rotary_embedding_torch import RotaryEmbedding
+from einops import rearrange, pack, unpack, reduce, repeat
+from einops.layers.torch import Rearrange
+from librosa import filters
+try:
+    from .pope.attention import flash_attn_with_pope
+    from .pope.pope import PoPE
+    _HAS_POPE = True
+except Exception:
+    PoPE = None
+    flash_attn_with_pope = None
+    _HAS_POPE = False
+# helper functions
+def exists(val):
+    return val is not None
+def default(v, d):
+    return v if exists(v) else d
+def pack_one(t, pattern):
+    return pack([t], pattern)
+def unpack_one(t, ps, pattern):
+    return unpack(t, ps, pattern)[0]
+def pad_at_dim(t, pad, dim=-1, value=0.):
+    dims_from_right = (- dim - 1) if dim < 0 else (t.ndim - dim - 1)
+    zeros = ((0, 0) * dims_from_right)
+    return F.pad(t, (*zeros, *pad), value=value)
+def l2norm(t):
+    return F.normalize(t, dim=-1, p=2)
+# norm
+class RMSNorm(Module):
+    def __init__(self, dim):
+        super().__init__()
+        self.scale = dim ** 0.5
+        self.gamma = nn.Parameter(torch.ones(dim))
+    def forward(self, x):
+        return F.normalize(x, dim=-1) * self.scale * self.gamma
+# feedforward and attention
+class FeedForward(Module):
+    def __init__(
+            self,
+            dim,
+            mult=4,
+            dropout=0.
+    ):
+        super().__init__()
+        dim_inner = int(dim * mult)
+        self.net = nn.Sequential(
+            RMSNorm(dim),
+            nn.Linear(dim, dim_inner),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(dim_inner, dim),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x):
+        return self.net(x)
+class Attention(Module):
+    def __init__(
+            self,
+            dim,
+            heads=8,
+            dim_head=64,
+            dropout=0.,
+            rotary_embed=None,
+            flash=True,
+            pope_embed=None,
+    ):
+        super().__init__()
+        self.heads = heads
+        self.scale = dim_head ** -0.5
+        dim_inner = heads * dim_head
+        self.rotary_embed = rotary_embed
+        self.pope_embed = pope_embed
+        assert not (self.rotary_embed is not None and self.pope_embed is not None), \
+            "cannot have both rotary and pope embeddings"
+        self.attend = Attend(flash=flash, dropout=dropout)
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Linear(dim, dim_inner * 3, bias=False)
+        self.to_gates = nn.Linear(dim, heads)
+        self.to_out = nn.Sequential(
+            nn.Linear(dim_inner, dim, bias=False),
+            nn.Dropout(dropout)
+        )
+    def forward(self, x):
+        x = self.norm(x)
+        q, k, v = rearrange(self.to_qkv(x), 'b n (qkv h d) -> qkv b h n d', qkv=3, h=self.heads)
+        if exists(self.pope_embed):
+            assert _HAS_POPE, "PoPE requested but PoPE_pytorch is not installed"
+            out = flash_attn_with_pope(
+                q, k, v,
+                pos_emb=self.pope_embed(q.shape[-2]),
+                softmax_scale=self.scale
+            )
+        elif exists(self.rotary_embed):
+            q = self.rotary_embed.rotate_queries_or_keys(q)
+            k = self.rotary_embed.rotate_queries_or_keys(k)
+            out = self.attend(q, k, v)
+        else:
+            out = self.attend(q, k, v)
+        gates = self.to_gates(x)
+        out = out * rearrange(gates, 'b n h -> b h n 1').sigmoid()
+        out = rearrange(out, 'b h n d -> b n (h d)')
+        return self.to_out(out)
+class LinearAttention(Module):
+    """
+    this flavor of linear attention was proposed in https://arxiv.org/abs/2106.09681 by El-Nouby et al.
+    """
+    @beartype
+    def __init__(
+            self,
+            *,
+            dim,
+            dim_head=32,
+            heads=8,
+            scale=8,
+            flash=False,
+            dropout=0.
+    ):
+        super().__init__()
+        dim_inner = dim_head * heads
+        self.norm = RMSNorm(dim)
+        self.to_qkv = nn.Sequential(
+            nn.Linear(dim, dim_inner * 3, bias=False),
+            Rearrange('b n (qkv h d) -> qkv b h d n', qkv=3, h=heads)
+        )
+        self.temperature = nn.Parameter(torch.ones(heads, 1, 1))
+        self.attend = Attend(
+            scale=scale,
+            dropout=dropout,
+            flash=flash
+        )
+        self.to_out = nn.Sequential(
+            Rearrange('b h d n -> b n (h d)'),
+            nn.Linear(dim_inner, dim, bias=False)
+        )
+    def forward(
+            self,
+            x
+    ):
+        x = self.norm(x)
+        q, k, v = self.to_qkv(x)
+        q, k = map(l2norm, (q, k))
+        q = q * self.temperature.exp()
+        out = self.attend(q, k, v)
+        return self.to_out(out)
+class Transformer(Module):
+    def __init__(
+            self,
+            *,
+            dim,
+            depth,
+            dim_head=64,
+            heads=8,
+            attn_dropout=0.,
+            ff_dropout=0.,
+            ff_mult=4,
+            norm_output=True,
+            rotary_embed=None,
+            pope_embed=None,
+            flash_attn=True,
+            linear_attn=False,
+    ):
+        super().__init__()
+        self.layers = ModuleList([])
+        for _ in range(depth):
+            if linear_attn:
+                attn = LinearAttention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    flash=flash_attn
+                )
+            else:
+                attn = Attention(
+                    dim=dim,
+                    dim_head=dim_head,
+                    heads=heads,
+                    dropout=attn_dropout,
+                    rotary_embed=rotary_embed,
+                    pope_embed=pope_embed,
+                    flash=flash_attn
+                )
+            self.layers.append(ModuleList([
+                attn,
+                FeedForward(dim=dim, mult=ff_mult, dropout=ff_dropout)
+            ]))
+        self.norm = RMSNorm(dim) if norm_output else nn.Identity()
+    def forward(self, x):
+        for attn, ff in self.layers:
+            x = attn(x) + x
+            x = ff(x) + x
+        return self.norm(x)
+# bandsplit module
+class BandSplit(Module):
+    @beartype
+    def __init__(
+            self,
+            dim,
+            dim_inputs: Tuple[int, ...]
+    ):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_features = ModuleList([])
+        for dim_in in dim_inputs:
+            net = nn.Sequential(
+                RMSNorm(dim_in),
+                nn.Linear(dim_in, dim)
+            )
+            self.to_features.append(net)
+    def forward(self, x):
+        x = x.split(self.dim_inputs, dim=-1)
+        outs = []
+        for split_input, to_feature in zip(x, self.to_features):
+            split_output = to_feature(split_input)
+            outs.append(split_output)
+        return torch.stack(outs, dim=-2)
+def MLP(
+        dim_in,
+        dim_out,
+        dim_hidden=None,
+        depth=1,
+        activation=nn.Tanh
+):
+    dim_hidden = default(dim_hidden, dim_in)
+    net = []
+    dims = (dim_in, *((dim_hidden,) * depth), dim_out)
+    for ind, (layer_dim_in, layer_dim_out) in enumerate(zip(dims[:-1], dims[1:])):
+        is_last = ind == (len(dims) - 2)
+        net.append(nn.Linear(layer_dim_in, layer_dim_out))
+        if is_last:
+            continue
+        net.append(activation())
+    return nn.Sequential(*net)
+class MaskEstimator(Module):
+    @beartype
+    def __init__(
+            self,
+            dim,
+            dim_inputs: Tuple[int, ...],
+            depth,
+            mlp_expansion_factor=4
+    ):
+        super().__init__()
+        self.dim_inputs = dim_inputs
+        self.to_freqs = ModuleList([])
+        dim_hidden = dim * mlp_expansion_factor
+        for dim_in in dim_inputs:
+            net = []
+            mlp = nn.Sequential(
+                MLP(dim, dim_in * 2, dim_hidden=dim_hidden, depth=depth),
+                nn.GLU(dim=-1)
+            )
+            self.to_freqs.append(mlp)
+    def forward(self, x):
+        x = x.unbind(dim=-2)
+        outs = []
+        for band_features, mlp in zip(x, self.to_freqs):
+            freq_out = mlp(band_features)
+            outs.append(freq_out)
+        return torch.cat(outs, dim=-1)
+# main class
+class MelBandRoformer(Module):
+    @beartype
+    def __init__(
+            self,
+            dim,
+            *,
+            depth,
+            stereo=False,
+            num_stems=1,
+            time_transformer_depth=2,
+            freq_transformer_depth=2,
+            linear_transformer_depth=0,
+            num_bands=60,
+            dim_head=64,
+            heads=8,
+            attn_dropout=0.1,
+            ff_dropout=0.1,
+            flash_attn=True,
+            dim_freqs_in=1025,
+            sample_rate=44100,  # needed for mel filter bank from librosa
+            stft_n_fft=2048,
+            stft_hop_length=512,
+            # appears to describe stft_hop_length above: 10ms at 44100Hz (441 samples; the default here is 512), from sections 4.1, 4.4 in the paper - @faroit recommends // 2 or // 4 for better reconstruction
+            stft_win_length=2048,
+            stft_normalized=False,
+            stft_window_fn: Optional[Callable] = None,
+            zero_dc = True,
+            mask_estimator_depth=1,
+            multi_stft_resolution_loss_weight=1.,
+            multi_stft_resolutions_window_sizes: Tuple[int, ...] = (4096, 2048, 1024, 512, 256),
+            multi_stft_hop_size=147,
+            multi_stft_normalized=False,
+            multi_stft_window_fn: Callable = torch.hann_window,
+            match_input_audio_length=False,  # if True, pad output tensor to match length of input tensor
+            mlp_expansion_factor=4,
+            use_torch_checkpoint=False,
+            skip_connection=False,
+            use_pope: bool = False,
+            **kwargs
+    ):
+        super().__init__()
+        self.stereo = stereo
+        self.audio_channels = 2 if stereo else 1
+        self.num_stems = num_stems
+        self.use_torch_checkpoint = use_torch_checkpoint
+        self.skip_connection = skip_connection
+        self.layers = ModuleList([])
+        transformer_kwargs = dict(
+            dim=dim,
+            heads=heads,
+            dim_head=dim_head,
+            attn_dropout=attn_dropout,
+            ff_dropout=ff_dropout,
+            flash_attn=flash_attn,
+        )
+        if use_pope:
+            assert _HAS_POPE, "PoPE requested but PoPE_pytorch is not installed"
+            time_pope_embed = PoPE(dim=dim_head, heads=heads)
+            freq_pope_embed = PoPE(dim=dim_head, heads=heads)
+            time_rotary_embed = None
+            freq_rotary_embed = None
+        else:
+            time_rotary_embed = RotaryEmbedding(dim=dim_head)
+            freq_rotary_embed = RotaryEmbedding(dim=dim_head)
+            time_pope_embed = freq_pope_embed = None
+        for _ in range(depth):
+            tran_modules = []
+            if linear_transformer_depth > 0:
+                tran_modules.append(Transformer(depth=linear_transformer_depth, linear_attn=True, **transformer_kwargs))
+            tran_modules.append(
+                Transformer(
+                    depth=time_transformer_depth,
+                    rotary_embed=time_rotary_embed,
+                    pope_embed=time_pope_embed,
+                    **transformer_kwargs
+                )
+            )
+            tran_modules.append(
+                Transformer(
+                    depth=freq_transformer_depth,
+                    rotary_embed=freq_rotary_embed,
+                    pope_embed=freq_pope_embed,
+                    **transformer_kwargs
+                )
+            )
+            self.layers.append(nn.ModuleList(tran_modules))
+        self.stft_window_fn = partial(default(stft_window_fn, torch.hann_window), stft_win_length)
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized
+        )
+        freqs = torch.stft(torch.randn(1, 4096), **self.stft_kwargs, window=torch.ones(stft_n_fft), return_complex=True).shape[1]
+        # create mel filter bank
+        # with librosa.filters.mel as in section 2 of paper
+        mel_filter_bank_numpy = filters.mel(sr=sample_rate, n_fft=stft_n_fft, n_mels=num_bands)
+        mel_filter_bank = torch.from_numpy(mel_filter_bank_numpy)
+        # for some reason, it doesn't include the first freq? just force a value for now
+        mel_filter_bank[0][0] = 1.
+        # In some systems/envs we get 0.0 instead of ~1.9e-18 in the last position,
+        # so let's force a positive value
+        mel_filter_bank[-1, -1] = 1.
+        # binary as in paper (then estimated masks are averaged for overlapping regions)
+        freqs_per_band = mel_filter_bank > 0
+        assert freqs_per_band.any(dim=0).all(), 'all frequencies need to be covered by all bands for now'
+        repeated_freq_indices = repeat(torch.arange(freqs), 'f -> b f', b=num_bands)
+        freq_indices = repeated_freq_indices[freqs_per_band]
+        if stereo:
+            freq_indices = repeat(freq_indices, 'f -> f s', s=2)
+            freq_indices = freq_indices * 2 + torch.arange(2)
+            freq_indices = rearrange(freq_indices, 'f s -> (f s)')
+        self.register_buffer('freq_indices', freq_indices, persistent=False)
+        self.register_buffer('freqs_per_band', freqs_per_band, persistent=False)
+        num_freqs_per_band = reduce(freqs_per_band, 'b f -> b', 'sum')
+        num_bands_per_freq = reduce(freqs_per_band, 'b f -> f', 'sum')
+        self.register_buffer('num_freqs_per_band', num_freqs_per_band, persistent=False)
+        self.register_buffer('num_bands_per_freq', num_bands_per_freq, persistent=False)
+        # band split and mask estimator
+        freqs_per_bands_with_complex = tuple(2 * f * self.audio_channels for f in num_freqs_per_band.tolist())
+        self.band_split = BandSplit(
+            dim=dim,
+            dim_inputs=freqs_per_bands_with_complex
+        )
+        self.mask_estimators = nn.ModuleList([])
+        for _ in range(num_stems):
+            mask_estimator = MaskEstimator(
+                dim=dim,
+                dim_inputs=freqs_per_bands_with_complex,
+                depth=mask_estimator_depth,
+                mlp_expansion_factor=mlp_expansion_factor,
+            )
+            self.mask_estimators.append(mask_estimator)
+        # whether to zero out dc
+        self.zero_dc = zero_dc
+        # for the multi-resolution stft loss
+        self.multi_stft_resolution_loss_weight = multi_stft_resolution_loss_weight
+        self.multi_stft_resolutions_window_sizes = multi_stft_resolutions_window_sizes
+        self.multi_stft_n_fft = stft_n_fft
+        self.multi_stft_window_fn = multi_stft_window_fn
+        self.multi_stft_kwargs = dict(
+            hop_length=multi_stft_hop_size,
+            normalized=multi_stft_normalized
+        )
+        self.match_input_audio_length = match_input_audio_length
+    def forward(
+            self,
+            raw_audio,
+            target=None,
+            active_stem_ids=None,
+            return_loss_breakdown=False
+    ):
+        """
+        einops dimension names used in the comments and patterns below
+        b - batch
+        f - freq
+        t - time
+        s - audio channel (1 for mono, 2 for stereo)
+        n - number of 'stems'
+        c - complex (2)
+        d - feature dimension
+        """
+        device = raw_audio.device
+        if raw_audio.ndim == 2:
+            raw_audio = rearrange(raw_audio, 'b t -> b 1 t')
+        batch, channels, raw_audio_length = raw_audio.shape
+        istft_length = raw_audio_length if self.match_input_audio_length else None
+        assert (not self.stereo and channels == 1) or (
+                    self.stereo and channels == 2), 'stereo needs to be set to True if passing in audio signal that is stereo (channel dimension of 2). also need to be False if mono (channel dimension of 1)'
+        # to stft
+        raw_audio, batch_audio_channel_packed_shape = pack_one(raw_audio, '* t')
+        stft_window = self.stft_window_fn(device=device)
+        stft_repr = torch.stft(raw_audio, **self.stft_kwargs, window=stft_window, return_complex=True)
+        stft_repr = torch.view_as_real(stft_repr)
+        stft_repr = unpack_one(stft_repr, batch_audio_channel_packed_shape, '* f t c')
+        # merge stereo / mono into the frequency, with frequency leading dimension, for band splitting
+        stft_repr = rearrange(stft_repr,'b s f t c -> b (f s) t c')
+        # index out all frequencies for all frequency ranges across bands ascending in one go
+        batch_arange = torch.arange(batch, device=device)[..., None]
+        # freq_indices already account for stereo (channel-interleaved indices built in __init__)
+        x = stft_repr[batch_arange, self.freq_indices]
+        # fold the complex (real and imag) into the frequencies dimension
+        x = rearrange(x, 'b f t c -> b t (f c)')
+        if self.use_torch_checkpoint:
+            x = checkpoint(self.band_split, x, use_reentrant=False)
+        else:
+            x = self.band_split(x)
+        # axial / hierarchical attention
+        store = [None] * len(self.layers)
+        for i, transformer_block in enumerate(self.layers):
+            if len(transformer_block) == 3:
+                linear_transformer, time_transformer, freq_transformer = transformer_block
+                x, ft_ps = pack([x], 'b * d')
+                if self.use_torch_checkpoint:
+                    x = checkpoint(linear_transformer, x, use_reentrant=False)
+                else:
+                    x = linear_transformer(x)
+                x, = unpack(x, ft_ps, 'b * d')
+            else:
+                time_transformer, freq_transformer = transformer_block
+            if self.skip_connection:
+                # add the stored outputs of all previous blocks
+                for j in range(i):
+                    x = x + store[j]
+            x = rearrange(x, 'b t f d -> b f t d')
+            x, ps = pack([x], '* t d')
+            if self.use_torch_checkpoint:
+                x = checkpoint(time_transformer, x, use_reentrant=False)
+            else:
+                x = time_transformer(x)
+            x, = unpack(x, ps, '* t d')
+            x = rearrange(x, 'b f t d -> b t f d')
+            x, ps = pack([x], '* f d')
+            if self.use_torch_checkpoint:
+                x = checkpoint(freq_transformer, x, use_reentrant=False)
+            else:
+                x = freq_transformer(x)
+            x, = unpack(x, ps, '* f d')
+            if self.skip_connection:
+                store[i] = x
+        if active_stem_ids is None:
+            heads = self.mask_estimators
+            stem_ids = list(range(len(self.mask_estimators)))
+        else:
+            heads = [self.mask_estimators[i] for i in active_stem_ids]
+            stem_ids = active_stem_ids
+        num_stems = len(heads)
+        if self.use_torch_checkpoint:
+            masks = torch.stack([checkpoint(fn, x, use_reentrant=False) for fn in heads], dim=1)
+        else:
+            masks = torch.stack([fn(x) for fn in heads], dim=1)
+        masks = rearrange(masks, 'b n t (f c) -> b n f t c', c=2)
+        # modulate frequency representation
+        stft_repr = rearrange(stft_repr, 'b f t c -> b 1 f t c')
+        # complex number multiplication
+        stft_repr = torch.view_as_complex(stft_repr)
+        masks = torch.view_as_complex(masks)
+        masks = masks.type(stft_repr.dtype)
+        # need to average the estimated mask for the overlapped frequencies
+        scatter_indices = repeat(self.freq_indices, 'f -> b n f t', b=batch, n=num_stems, t=stft_repr.shape[-1])
+        stft_repr_expanded_stems = repeat(stft_repr, 'b 1 ... -> b n ...', n=num_stems)
+        masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(2, scatter_indices, masks)
+        denom = repeat(self.num_bands_per_freq, 'f -> (f r) 1', r=channels)
+        masks_averaged = masks_summed / denom.clamp(min=1e-8)
+        # modulate stft repr with estimated mask
+        stft_repr = stft_repr * masks_averaged
+        # istft
+        stft_repr = rearrange(stft_repr, 'b n (f s) t -> (b n s) f t', s=self.audio_channels)
+        if self.zero_dc:
+            # dc filter: zero out the first (0 Hz) frequency bin
+            stft_repr = stft_repr.index_fill(1, tensor(0, device = device), 0.)
+        recon_audio = torch.istft(stft_repr, **self.stft_kwargs, window=stft_window, return_complex=False,
+                                  length=istft_length)
+        recon_audio = rearrange(recon_audio, '(b n s) t -> b n s t', b=batch, s=self.audio_channels, n=num_stems)
+        if num_stems == 1:
+            recon_audio = rearrange(recon_audio, 'b 1 s t -> b s t')
+        # if a target is passed in, calculate loss for learning
+        if not exists(target):
+            return recon_audio
+        if self.num_stems > 1:
+            assert target.ndim == 4 and target.shape[1] == self.num_stems
+        if target.ndim == 2:
+            target = rearrange(target, '... t -> ... 1 t')
+        target = target[..., :recon_audio.shape[-1]]  # protect against lost length on istft
+        target_sel = target[:, stem_ids]
+        loss = F.l1_loss(recon_audio, target_sel)
+        multi_stft_resolution_loss = 0.
+        for window_size in self.multi_stft_resolutions_window_sizes:
+            res_stft_kwargs = dict(
+                n_fft=max(window_size, self.multi_stft_n_fft),  # not sure what n_fft is across multi resolution stft
+                win_length=window_size,
+                return_complex=True,
+                window=self.multi_stft_window_fn(window_size, device=device),
+                **self.multi_stft_kwargs,
+            )
+            recon_Y = torch.stft(
+                rearrange(recon_audio, 'b n s t -> (b n s) t'),
+                **res_stft_kwargs
+            )
+            target_Y = torch.stft(
+                rearrange(target_sel, 'b n s t -> (b n s) t'),
+                **res_stft_kwargs
+            )
+            multi_stft_resolution_loss = multi_stft_resolution_loss + F.l1_loss(recon_Y, target_Y)
+        weighted_multi_resolution_loss = multi_stft_resolution_loss * self.multi_stft_resolution_loss_weight
+        total_loss = loss + weighted_multi_resolution_loss
+        if not return_loss_breakdown:
+            return total_loss
         return total_loss, (loss, multi_stft_resolution_loss)