Make xformers optional to reduce Space build failures
audiocraft/modules/transformer.py
CHANGED
@@ -20,7 +20,10 @@ import torch
 import torch.nn as nn
 from torch.nn import functional as F
 from torch.utils.checkpoint import checkpoint as torch_checkpoint
-from xformers import ops
+try:
+    from xformers import ops
+except Exception:
+    ops = None
 
 from .rope import RotaryEmbedding
 from .streaming import StreamingModule
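This is the standard optional-dependency guard: attempt the import once at module load and keep a None sentinel on failure, so every later use can branch on availability instead of crashing the build. Catching Exception rather than only ImportError also covers xformers failing inside its own import (for example a torch/CUDA version mismatch), a common cause of Space build failures. A minimal self-contained sketch of the same pattern (the has_xformers helper is illustrative, not part of this diff):

try:
    from xformers import ops  # optional accelerated attention kernels
except Exception:  # also catches xformers' own import-time failures
    ops = None

def has_xformers() -> bool:
    # Callers branch on this sentinel instead of re-attempting the import.
    return ops is not None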
@@ -31,7 +34,9 @@ _efficient_attention_backend: str = 'torch'
 def set_efficient_attention_backend(backend: str = 'torch'):
     # Using torch by default, it seems a bit faster on older P100 GPUs (~20% faster).
     global _efficient_attention_backend
-    assert _efficient_attention_backend in ['xformers', 'torch']
+    assert backend in ['xformers', 'torch']
+    if backend == 'xformers' and ops is None:
+        backend = 'torch'
     _efficient_attention_backend = backend
 
 
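The setter now validates the requested name and silently downgrades 'xformers' to 'torch' when the import failed. A usage sketch against the module as patched (assuming the upstream audiocraft module path):

import audiocraft.modules.transformer as tr

tr.set_efficient_attention_backend('xformers')
# On a Space without xformers installed, the request degrades:
print(tr._efficient_attention_backend)  # 'xformers' if available, else 'torch'

The downgrade is deliberately silent; a warnings.warn in the fallback branch would make it visible without failing the build, at the cost of some log noise.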
@@ -236,7 +241,7 @@ class StreamingMultiheadAttention(StreamingModule):
         # We actually return a bias for the attention score, as this has the same
         # convention both in the builtin MHA in Pytorch, and Xformers functions.
         time_dim = _get_attention_time_dimension(self.memory_efficient)
-        if self.memory_efficient:
+        if self.memory_efficient and _efficient_attention_backend == 'xformers' and ops is not None:
             from xformers.ops import LowerTriangularMask
             if current_steps == 1:
                 # If we only have one step, then we do not need a mask.
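The widened condition ensures the xformers-only LowerTriangularMask import runs only when xformers is both selected as the backend and actually importable. Written out as a standalone predicate for clarity (the helper name is hypothetical):

def _xformers_path_active(memory_efficient: bool) -> bool:
    # All three conditions must hold before touching xformers-only symbols.
    return (memory_efficient
            and _efficient_attention_backend == 'xformers'
            and ops is not None)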
@@ -373,7 +378,10 @@ class StreamingMultiheadAttention(StreamingModule):
                 else:
                     bound_layout = "b t p h d"
                 packed = rearrange(projected, f"b t (p h d) -> {bound_layout}", p=3, h=self.num_heads)
-                q, k, v = ops.unbind(packed, dim=2)
+                if ops is None:
+                    q, k, v = torch.unbind(packed, dim=2)
+                else:
+                    q, k, v = ops.unbind(packed, dim=2)
             else:
                 embed_dim = self.embed_dim
                 per_head_dim = (embed_dim // self.num_heads)
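torch.unbind splits a tensor into views along one dimension, so the fallback is numerically identical to xformers' ops.unbind here; the xformers variant mainly exists to keep the backward pass cheap when the outputs all come from one packed buffer. A quick standalone check (shapes are illustrative):

import torch

packed = torch.randn(2, 8, 3, 4, 16)  # "b t p h d", with p=3 packing q, k, v
q, k, v = torch.unbind(packed, dim=2)
assert q.shape == (2, 8, 4, 16)  # the p axis is consumed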
@@ -425,7 +433,11 @@ class StreamingMultiheadAttention(StreamingModule):
                     x = torch.nn.functional.scaled_dot_product_attention(
                         q, k, v, is_causal=self.causal, attn_mask=attn_mask, dropout_p=p)
                 else:
-                    x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
+                    if ops is None:
+                        x = torch.nn.functional.scaled_dot_product_attention(
+                            q, k, v, is_causal=self.causal, attn_mask=attn_mask, dropout_p=p)
+                    else:
+                        x = ops.memory_efficient_attention(q, k, v, attn_mask, p=p)
             else:
                 # We include the dot product as float32, for consistency
                 # with the other implementations that include that step
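With xformers absent, the memory-efficient path now reuses PyTorch's built-in scaled_dot_product_attention, which has fused kernels of its own, so the ops-is-None branch simply duplicates the 'torch' branch above it; the duplication keeps the diff minimal rather than restructuring the conditional. A minimal shape-compatible call (shapes are illustrative; note that recent PyTorch versions reject is_causal=True combined with a non-None attn_mask, so only one should be set):

import torch
import torch.nn.functional as F

q = k = v = torch.randn(2, 4, 8, 16)  # batch, heads, time, head_dim
out = F.scaled_dot_product_attention(q, k, v, is_causal=True, dropout_p=0.0)
print(out.shape)  # torch.Size([2, 4, 8, 16])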