rrivera1849
/

LUAR-MUD

@@ -7,7 +7,10 @@
     "AutoModel": "model.LUAR"
   },
   "embedding_size": 512,
   "model_type": "LUAR",
   "torch_dtype": "float32",
-  "transformers_version": "4.33.2"
 }

     "AutoModel": "model.LUAR"
   },
   "embedding_size": 512,
+  "k_bucket_size": 1024,
   "model_type": "LUAR",
+  "q_bucket_size": 512,
   "torch_dtype": "float32",
+  "transformers_version": "4.33.2",
+  "use_memory_efficient_attention": false
 }

config.py CHANGED Viewed

@@ -6,7 +6,13 @@ class LUARConfig(PretrainedConfig):
     def __init__(self,
         embedding_size: int = 512,
         **kwargs,
     ):
         self.embedding_size = embedding_size
         super().__init__(**kwargs)

     def __init__(self,
         embedding_size: int = 512,
+        use_memory_efficient_attention=False,
+        q_bucket_size=512,
+        k_bucket_size=1024,
         **kwargs,
     ):
+        """Store the LUAR-specific settings and hand everything else to the parent.
+
+        Args:
+            embedding_size: Output width of the final linear layer that
+                ``LUAR`` builds from this config.
+            use_memory_efficient_attention: Passed to ``SelfAttention``; when
+                true, attention is computed chunk by chunk instead of in one
+                matrix product.
+            q_bucket_size: Query chunk length used by the chunked attention.
+            k_bucket_size: Key/value chunk length used by the chunked attention.
+            **kwargs: Forwarded unchanged to the parent ``__init__``.
+        """
         self.embedding_size = embedding_size
+        self.use_memory_efficient_attention = use_memory_efficient_attention
+        self.q_bucket_size = q_bucket_size
+        self.k_bucket_size = k_bucket_size
         super().__init__(**kwargs)

model.py CHANGED Viewed

@@ -1,29 +1,135 @@
 import math
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, reduce, repeat
 from transformers import AutoModel, PreTrainedModel
 from .config import LUARConfig
 class SelfAttention(nn.Module):
     """Implements Dot-Product Self-Attention as used in "Attention is all You Need".
     """
-    def __init__(self):
         super(SelfAttention, self).__init__()
     def forward(self, k, q, v):
-        if hasattr(F, "scaled_dot_product_attention") and torch.cuda.is_available():
-            with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_mem_efficient=True):
-                return F.scaled_dot_product_attention(k, q, v)
         else:
             d_k = q.size(-1)
             scores = torch.matmul(k, q.transpose(-2, -1)) / math.sqrt(d_k)
             p_attn = F.softmax(scores, dim=-1)
             return torch.matmul(p_attn, v)
 class LUAR(PreTrainedModel):
@@ -34,7 +140,11 @@ class LUAR(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.create_transformer()
-        self.attn_fn = SelfAttention()
         self.linear = nn.Linear(self.hidden_size, config.embedding_size)
     def create_transformer(self):

 import math
+from functools import partial
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 from einops import rearrange, reduce, repeat
+from torch.utils.checkpoint import checkpoint
 from transformers import AutoModel, PreTrainedModel
 from .config import LUARConfig
+# Adapted from lucidrains' implementation of memory-efficient attention:
+# https://github.com/lucidrains/memory-efficient-attention-pytorch
+def exists(val):
+    """Tell whether ``val`` holds a value, i.e. is anything other than ``None``."""
+    if val is None:
+        return False
+    return True
+def summarize_qkv_chunk(
+    q, k, v,
+    mask
+):
+    """Summarize un-normalized dot-product attention for one chunk pair.
+
+    Computes the scores ``q @ k^T`` for 4-D inputs laid out as
+    ``(b, h, i, d)`` and ``(b, h, j, d)``, optionally adds ``mask`` to them,
+    and subtracts each query row's maximum before exponentiating.
+
+    Returns a 3-tuple: the row sums of the exponentiated scores, the
+    exponentiated scores multiplied into ``v``, and the row maxima that were
+    subtracted (trailing singleton axis removed). The maxima are detached,
+    so no gradient flows through them.
+    """
+    scores = torch.einsum('b h i d, b h j d -> b h i j', q, k)
+    if mask is not None:
+        # HuggingFace-style masks are additive, so they are summed into the scores.
+        scores += mask
+    row_max = scores.amax(dim = -1, keepdim = True).detach()
+    exp_scores = (scores - row_max).exp()
+    exp_sum = exp_scores.sum(dim = -1)
+    value_sum = torch.einsum('b h i j, b h j d -> b h i d', exp_scores, v)
+    return exp_sum, value_sum, rearrange(row_max, '... 1 -> ...')
+# Same computation as summarize_qkv_chunk, but invoked through
+# torch.utils.checkpoint.checkpoint; memory_efficient_attention picks this
+# variant whenever q, k or v requires a gradient.
+checkpointed_summarize_qkv_chunk = partial(checkpoint, summarize_qkv_chunk)
+def memory_efficient_attention(
+    q, k, v,
+    mask = None,
+    q_bucket_size = 512,
+    k_bucket_size = 1024,
+    eps = 1e-8
+):
+    """Compute scaled dot-product attention chunk by chunk.
+
+    Each query chunk is paired with every key/value chunk; summarize_qkv_chunk
+    reduces a pair to partial sums (shifted by that pair's own row maximum),
+    and the partial sums are rescaled to a shared maximum before being
+    combined. Score matrices are therefore only ever built per chunk pair.
+
+    Args:
+        q, k, v: 4-D tensors consumed by summarize_qkv_chunk as
+            ``(b, h, n, d)``; they are split along the second-to-last axis.
+        mask: Optional tensor added to the scores. It is split along its
+            last axis only, in steps of ``k_bucket_size``.
+        q_bucket_size: Chunk length for ``q``.
+        k_bucket_size: Chunk length for ``k``, ``v`` and ``mask``.
+        eps: Constant added to the softmax denominator before dividing.
+
+    Returns:
+        The per-query-chunk outputs concatenated along the second-to-last axis.
+    """
+    # Fold the 1/sqrt(d) scaling into the queries once, up front.
+    scale = q.shape[-1] ** -0.5
+    q = q * scale
+    # Route through the checkpointed summarizer only when a gradient is needed.
+    needs_backwards = q.requires_grad or k.requires_grad or v.requires_grad
+    summarize_qkv_fn = checkpointed_summarize_qkv_chunk if needs_backwards else summarize_qkv_chunk
+    # chunk all the inputs
+    q_chunks = q.split(q_bucket_size, dim = -2)
+    k_chunks = k.split(k_bucket_size, dim = -2)
+    v_chunks = v.split(k_bucket_size, dim = -2)
+    # NOTE(review): the mask is not chunked along the query axis, so it
+    # presumably has to broadcast over queries -- confirm against callers.
+    mask_chunks = mask.split(k_bucket_size, dim = -1) if exists(mask) else ((None,) * len(k_chunks))
+    # loop through all chunks and accumulate
+    out = []
+    for q_index, q_chunk in enumerate(q_chunks):
+        exp_weights = []
+        weighted_values = []
+        weight_maxes = []
+        for k_index, (k_chunk, v_chunk, mask_chunk) in enumerate(zip(k_chunks, v_chunks, mask_chunks)):
+            exp_weight_chunk, weighted_value_chunk, weight_max_chunk = summarize_qkv_fn(
+                q_chunk,
+                k_chunk,
+                v_chunk,
+                mask_chunk,
+            )
+            exp_weights.append(exp_weight_chunk)
+            weighted_values.append(weighted_value_chunk)
+            weight_maxes.append(weight_max_chunk)
+        # Stack the per-key-chunk results on a new trailing axis.
+        exp_weights = torch.stack(exp_weights, dim = -1)
+        weighted_values = torch.stack(weighted_values, dim = -1)
+        weight_maxes = torch.stack(weight_maxes, dim = -1)
+        # Each chunk was shifted by its own maximum; rescale all of them to
+        # the largest maximum so the partial sums can be added together.
+        global_max = weight_maxes.amax(dim = -1, keepdim = True)
+        renorm_factor = (weight_maxes - global_max).exp().detach()
+        exp_weights = exp_weights * renorm_factor
+        weighted_values = weighted_values * rearrange(renorm_factor, '... c -> ... 1 c')
+        # Collapse the chunk axis, then divide numerator by denominator.
+        all_values = weighted_values.sum(dim = -1)
+        all_weights = exp_weights.sum(dim = -1)
+        normalized_values = all_values / (rearrange(all_weights, '... -> ... 1') + eps)
+        out.append(normalized_values)
+    return torch.cat(out, dim=-2)
 class SelfAttention(nn.Module):
+    """Dot-product self-attention with an optional chunked, multi-head path.
+
+    Default path: ``softmax(k @ q^T / sqrt(d)) @ v`` over the full feature
+    width ``d``, following "Attention is all You Need". Note that ``k`` is
+    the left operand here, so it plays the query role.
+
+    Memory-efficient path: the feature axis is split into ``num_heads``
+    heads and ``memory_efficient_attention`` is run with ``q`` as the
+    queries, scaled by the per-head width. No mask is passed.
+
+    NOTE(review): the two paths are not numerically equivalent (single-head
+    vs. multi-head, different scale) -- confirm this is intended.
+
+    Args:
+        memory_efficient_attention: Select the chunked path when true.
+        q_bucket_size: Query chunk length for the chunked path.
+        k_bucket_size: Key/value chunk length for the chunked path.
+        num_heads: Number of heads the feature axis is split into on the
+            chunked path; the feature width must be divisible by it.
+            Defaults to 12, the value that used to be hard-coded.
     """
+    def __init__(
+            self,
+            memory_efficient_attention=False,
+            q_bucket_size=512,
+            k_bucket_size=1024,
+            num_heads=12,
+        ):
         super(SelfAttention, self).__init__()
+        self.use_memory_efficient_attention = memory_efficient_attention
+        self.q_bucket_size = q_bucket_size
+        self.k_bucket_size = k_bucket_size
+        self.num_heads = num_heads
     def forward(self, k, q, v):
+        """Attend over ``(b, n, d)`` inputs; arguments are ordered ``k, q, v``."""
+        if self.use_memory_efficient_attention:
+            # Split the feature axis into heads: (b, n, h*d) -> (b, h, n, d).
+            q, k, v = map(
+                lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.num_heads),
+                (q, k, v)
+            )
+            out = memory_efficient_attention(
+                q, k, v,
+                q_bucket_size=self.q_bucket_size,
+                k_bucket_size=self.k_bucket_size
+            )
+            # Merge the heads back into a single feature axis.
+            out = rearrange(out, 'b h n d -> b n (h d)')
+            return out
         else:
             d_k = q.size(-1)
             scores = torch.matmul(k, q.transpose(-2, -1)) / math.sqrt(d_k)
             p_attn = F.softmax(scores, dim=-1)
             return torch.matmul(p_attn, v)
 class LUAR(PreTrainedModel):
     def __init__(self, config):
         super().__init__(config)
         self.create_transformer()
+        self.attn_fn = SelfAttention(
+            config.use_memory_efficient_attention,
+            config.q_bucket_size,
+            config.k_bucket_size,
+        )
         self.linear = nn.Linear(self.hidden_size, config.embedding_size)
     def create_transformer(self):