Upload BD3LM
- config.json +2 -0
- configuration_bd3lm.py +4 -0
- modeling_bd3lm.py +87 -39
config.json
CHANGED
@@ -1,5 +1,6 @@
 {
   "_name_or_path": "kuleshov-group/bd3lm-owt-block_size8",
+  "adaln": true,
   "architectures": [
     "BD3LM"
   ],
@@ -9,6 +10,7 @@
     "AutoModelForMaskedLM": "modeling_bd3lm.BD3LM"
   },
   "block_size": 8,
+  "causal": false,
   "cond_dim": 128,
   "cross_attn": true,
   "dropout": 0.1,
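The auto_map entry above is unchanged, so the checkpoint still loads through the masked-LM auto class and the new "adaln" / "causal" keys are simply forwarded to BD3LMConfig. A minimal loading sketch (assumes a transformers version supporting trust_remote_code; sampling code is not shown here):

import transformers

# Downloads config.json plus the two Python files in this commit and
# instantiates modeling_bd3lm.BD3LM with the updated config.
model = transformers.AutoModelForMaskedLM.from_pretrained(
    'kuleshov-group/bd3lm-owt-block_size8', trust_remote_code=True)
print(model.config.adaln, model.config.causal)  # True, False for this checkpoint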
configuration_bd3lm.py
CHANGED
@@ -15,7 +15,9 @@ class BD3LMConfig(transformers.PretrainedConfig):
     vocab_size: int = 50258,
     model_length: int = 1024,
     cross_attn: bool = True,
+    adaln: bool = True,
     attn_backend: str = 'flex',
+    causal: bool = False,
     hidden_dim: int = 768,
     cond_dim: int = 129,
     n_blocks: int = 12,
@@ -29,7 +31,9 @@ class BD3LMConfig(transformers.PretrainedConfig):
     super().__init__(**kwargs)
     self.block_size = block_size
     self.cross_attn = cross_attn
+    self.adaln = adaln
     self.attn_backend = attn_backend
+    self.causal = causal
     self.vocab_size = vocab_size
     self.model_length = model_length
     self.hidden_dim = hidden_dim
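Both flags default to the previous behaviour (adaln=True, causal=False), so configs written before this commit keep working. A small instantiation sketch (values other than the defaults are made up for illustration):

from configuration_bd3lm import BD3LMConfig

# adaln=False removes the time-conditioning (adaLN) parameters from the model;
# causal=True asks SDPA attention for a causal mask (see modeling_bd3lm.py below).
config = BD3LMConfig(block_size=8, adaln=False, causal=True)
print(config.adaln, config.causal)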
modeling_bd3lm.py
CHANGED
@@ -296,14 +296,16 @@ def regular_attention_multi_headed(qkv):
 
 
 class DDiTBlock(nn.Module):
-  def __init__(self, n, block_size, dim, n_heads, cond_dim,
-               mlp_ratio=4, dropout=0.1, attn_backend='sdpa'):
+  def __init__(self, n, block_size, dim, n_heads, cond_dim, causal=False,
+               mlp_ratio=4, dropout=0.1, adaln=True, attn_backend='sdpa'):
     super().__init__()
     self.n = n
     self.block_size = block_size
     self.n_heads = n_heads
     self.attn_backend = attn_backend
     self.kv_cache = None
+    self.cache_idx = 0
+    self.causal = causal
 
     self.norm1 = LayerNorm(dim)
     self.attn_qkv = nn.Linear(dim, 3 * dim, bias=False)
@@ -317,10 +319,11 @@ class DDiTBlock(nn.Module):
       nn.Linear(mlp_ratio * dim, dim, bias=True))
     self.dropout2 = nn.Dropout(dropout)
     self.dropout = dropout
-
-    self.adaLN_modulation = nn.Linear(cond_dim, 6 * dim, bias=True)
-    self.adaLN_modulation.weight.data.zero_()
-    self.adaLN_modulation.bias.data.zero_()
+    self.adaln = adaln
+    if self.adaln:
+      self.adaLN_modulation = nn.Linear(cond_dim, 6 * dim, bias=True)
+      self.adaLN_modulation.weight.data.zero_()
+      self.adaLN_modulation.bias.data.zero_()
 
   def _get_bias_dropout_scale(self):
     if self.training:
@@ -331,13 +334,18 @@ class DDiTBlock(nn.Module):
   def get_qkv(self, x, rotary_cos_sin, store_kv=False):
     # compute qkv (potentially use cache)
     if self.kv_cache is not None:
-      new_qkv = self.attn_qkv(x[:, -self.block_size:])
-      qkv = torch.cat((self.kv_cache, new_qkv), dim=1)
+      new_qkv = self.attn_qkv(x)
+      self.kv_cache[:, self.cache_idx:self.cache_idx+self.block_size] = new_qkv
+      qkv = self.kv_cache[:, :self.cache_idx+self.block_size].clone()
     else:
       qkv = self.attn_qkv(x)
     # store kv cache in a sliding window (can't exceed context len)
     if store_kv:
-      self.kv_cache = qkv[:, -(self.n - self.block_size):]
+      self.cache_idx += self.block_size
+      if self.cache_idx >= self.n:
+        # left-shift the cache
+        self.cache_idx = self.n - self.block_size
+        self.kv_cache[:, :-self.block_size] = self.kv_cache[:, self.block_size:].clone()
 
     qkv = einops.rearrange(
       qkv,
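The cache is now a preallocated buffer indexed by cache_idx rather than a tensor grown step by step: each sampled block overwrites one block_size-wide slot, and once the buffer would overflow the context length n the contents are shifted left by one block. A self-contained sketch of just that indexing, with toy sizes (not the model code itself):

import torch

n, block_size, dim = 16, 4, 2      # toy context length, block size, feature dim
cache = torch.zeros(1, n, dim)     # stands in for block.kv_cache
cache_idx = 0                      # stands in for block.cache_idx

def append_block(new_block):
    global cache_idx
    cache[:, cache_idx:cache_idx + block_size] = new_block
    visible = cache[:, :cache_idx + block_size].clone()   # what attention sees
    cache_idx += block_size
    if cache_idx >= n:             # sliding window: drop the oldest block
        cache_idx = n - block_size
        cache[:, :-block_size] = cache[:, block_size:].clone()
    return visible

for step in range(6):
    out = append_block(torch.full((1, block_size, dim), float(step)))
    print(step, out.shape[1], cache_idx)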
@@ -359,7 +367,7 @@ class DDiTBlock(nn.Module):
         key=qkv[:, :, 1],
         value=qkv[:, :, 2],
         attn_mask=mask,
-        is_causal=False,
+        is_causal=self.causal,
         scale=1 / math.sqrt(scale))
     x = x.transpose(1, 2)
     x = einops.rearrange(x, 'b s h d -> b s (h d)')
@@ -376,12 +384,16 @@ class DDiTBlock(nn.Module):
               sample_mode=False, store_kv=False):
     bias_dropout_scale_fn = self._get_bias_dropout_scale()
 
-    (shift_msa, scale_msa, gate_msa, shift_mlp,
-     scale_mlp, gate_mlp) = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)
+    if self.adaln:
+      (shift_msa, scale_msa, gate_msa, shift_mlp,
+       scale_mlp, gate_mlp) = self.adaLN_modulation(c)[:, None].chunk(6, dim=2)
 
     # attention operation
     x_skip = x
-    x = modulate_fused(self.norm1(x), shift_msa, scale_msa)
+    if self.adaln:
+      x = modulate_fused(self.norm1(x), shift_msa, scale_msa)
+    else:
+      x = self.norm1(x)
 
     # get qkvs
     if mask is not None and not sample_mode:
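With adaln enabled, the conditioning vector c is projected to six per-dimension chunks (shift/scale/gate for the attention and MLP sub-blocks); with it disabled the block reduces to a plain pre-norm transformer block. The modulation itself is the usual DiT-style shift-and-scale; a sketch of what modulate_fused computes (the fused/scripted details in modeling_bd3lm.py may differ):

import torch

def modulate(x, shift, scale):
    # DiT-style adaptive layer norm: rescale and shift normalized activations
    # using parameters predicted from the conditioning vector.
    return x * (1 + scale) + shift

x = torch.randn(2, 8, 16)          # (batch, length, hidden)
shift = torch.zeros(2, 1, 16)      # adaLN_modulation is zero-initialized,
scale = torch.zeros(2, 1, 16)      # so the modulation starts as the identity
assert torch.allclose(modulate(x, shift, scale), x)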
@@ -398,18 +410,28 @@ class DDiTBlock(nn.Module):
       x = self.cross_attn(x, qkv, mask=mask)
     else:
       raise ValueError('Unknown attention backend')
+
+    if self.kv_cache is not None:
+      x = x[:, -self.block_size:]
 
-    x = bias_dropout_scale_fn(self.attn_out(x),
+    # mlp operation
+    if self.adaln:
+      x = bias_dropout_scale_fn(self.attn_out(x),
                                 None,
                                 gate_msa,
                                 x_skip,
                                 self.dropout)
+      x = bias_dropout_scale_fn(
+        self.mlp(modulate_fused(
+          self.norm2(x), shift_mlp, scale_mlp)),
+        None, gate_mlp, x, self.dropout)
+    else:
+      x = bias_dropout_scale_fn(self.attn_out(x),
+        None, torch.ones_like(x), x_skip, self.dropout)
+      x = bias_dropout_scale_fn(
+        self.mlp(self.norm2(x)),
+        None, torch.ones_like(x), x, self.dropout)
 
-    # mlp operation
-    x = bias_dropout_scale_fn(
-      self.mlp(modulate_fused(
-        self.norm2(x), shift_mlp, scale_mlp)),
-      None, gate_mlp, x, self.dropout)
     return x
 
 
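In both branches the residual update goes through bias_dropout_scale_fn, which folds dropout, a multiplicative gate, and the skip connection into one call; the non-adaln branch simply passes torch.ones_like(x) as the gate, i.e. an ungated residual. A rough sketch of the helper's semantics as used here (argument order x, bias, gate, residual, dropout probability; the real helper is a fused version whose exact signature is not shown in this diff):

import torch
import torch.nn.functional as F

def bias_dropout_add_scale(x, bias, gate, residual, prob, training=False):
    # residual + gate * dropout(x + bias); bias may be None.
    if bias is not None:
        x = x + bias
    return residual + gate * F.dropout(x, p=prob, training=training)

h = torch.randn(2, 8, 16)
skip = torch.randn(2, 8, 16)
gate = torch.zeros(2, 1, 16)       # gate_msa is zero at init (zeroed adaLN weights),
out = bias_dropout_add_scale(h, None, gate, skip, 0.1)
assert torch.allclose(out, skip)   # so at init the block output is just the skip path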
@@ -424,23 +446,28 @@ class EmbeddingLayer(nn.Module):
 
 
 class DDitFinalLayer(nn.Module):
-  def __init__(self, hidden_size, out_channels, cond_dim):
+  def __init__(self, hidden_size, out_channels, cond_dim, adaln=True):
     super().__init__()
     self.norm_final = LayerNorm(hidden_size)
     self.linear = nn.Linear(hidden_size, out_channels)
     self.linear.weight.data.zero_()
     self.linear.bias.data.zero_()
 
-    self.adaLN_modulation = nn.Linear(cond_dim,
-                                      2 * hidden_size,
-                                      bias=True)
-    self.adaLN_modulation.weight.data.zero_()
-    self.adaLN_modulation.bias.data.zero_()
+    self.adaln = adaln
+    if self.adaln:
+      self.adaLN_modulation = nn.Linear(cond_dim,
+                                        2 * hidden_size,
+                                        bias=True)
+      self.adaLN_modulation.weight.data.zero_()
+      self.adaLN_modulation.bias.data.zero_()
 
 
   def forward(self, x, c):
-    shift, scale = self.adaLN_modulation(c)[:, None].chunk(2, dim=2)
-    x = modulate_fused(self.norm_final(x), shift, scale)
+    if self.adaln:
+      shift, scale = self.adaLN_modulation(c)[:, None].chunk(2, dim=2)
+      x = modulate_fused(self.norm_final(x), shift, scale)
+    else:
+      x = self.norm_final(x)
     x = self.linear(x)
     return x
 
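Because the whole adaLN path can now be switched off, the conditioning argument of the output head becomes optional in practice: with adaln=False the forward pass only layer-norms and projects, and c is never touched. A shape-level usage sketch, assuming the classes defined in modeling_bd3lm.py are in scope (sizes taken from this checkpoint's config):

import torch

head = DDitFinalLayer(hidden_size=768, out_channels=50258,
                      cond_dim=128, adaln=False)
logits = head(torch.randn(1, 8, 768), c=None)   # c is ignored when adaln is off
print(logits.shape)                             # torch.Size([1, 8, 50258])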
@@ -460,8 +487,10 @@ class DITBackbone(nn.Module):
     self.vocab_embed = EmbeddingLayer(
       config.hidden_dim,
       config.vocab_size)
-    self.sigma_map = TimestepEmbedder(
-      config.cond_dim)
+    self.adaln = config.adaln
+    if self.adaln:
+      self.sigma_map = TimestepEmbedder(
+        config.cond_dim)
     self.rotary_emb = Rotary(
       config.hidden_dim // config.n_heads)
 
@@ -472,14 +501,17 @@ class DITBackbone(nn.Module):
         config.hidden_dim,
         config.n_heads,
         config.cond_dim,
+        causal=config.causal,
         dropout=config.dropout,
+        adaln=config.adaln,
         attn_backend=config.attn_backend,))
     self.blocks = nn.ModuleList(blocks)
 
     self.output_layer = DDitFinalLayer(
       config.hidden_dim,
       config.vocab_size,
-      config.cond_dim)
+      config.cond_dim,
+      adaln=config.adaln)
     if self.cross_attn:
       self.gen_mask(config.model_length, self.block_size, attn_backend=config.attn_backend)
     self.precision = torch.float32
@@ -505,21 +537,31 @@ class DITBackbone(nn.Module):
 
   def forward(self, indices, sigma, sample_mode=False,
               store_kv=False, output_hidden_states=False):
-    if not self.config.time_conditioning:
+    if not self.config.time_conditioning and self.adaln:
       sigma = torch.zeros_like(sigma)
     all_hidden_states = []
     x = self.vocab_embed(indices)
     if output_hidden_states:
       all_hidden_states.append(x)
-    c = F.silu(self.sigma_map(sigma))
+    c = None
+    if self.adaln:
+      c = F.silu(self.sigma_map(sigma))
     if self.cross_attn:
       n = self.mask.shape[-1] // 2
-      rotary_cos_sin = self.rotary_emb(x[:, :n])
-      mask = self.mask.to(x.device)
       # use block-causal mask only during sampling
       if sample_mode:
-        mask = mask[
-          n:n+x.shape[1], n:n+x.shape[1]]
+        if self.blocks[0].kv_cache is not None:
+          mask = None
+          accum_length = self.blocks[0].cache_idx + self.block_size
+          # positional encodings for cache
+          x_full = torch.zeros((
+            x.shape[0], accum_length, x.shape[2]), device=x.device)
+          rotary_cos_sin = self.rotary_emb(x_full)
+        else:
+          mask = self.mask.to(x.device)
+          rotary_cos_sin = self.rotary_emb(x[:, :n])
+          mask = mask[
+            n:n+x.shape[1], n:n+x.shape[1]]
     else:
       mask = None
       rotary_cos_sin = self.rotary_emb(x)
@@ -558,10 +600,16 @@ class BD3LM(transformers.PreTrainedModel):
     self.register_buffer(
       'sampling_eps_max',
       torch.tensor(config.sampling_eps_max))
-
-  def reset_kv_cache(self):
+
+  def reset_kv_cache(self, eval_batch_size=1):
     for block in self.backbone.blocks:
-      block.kv_cache = None
+      block.kv_cache = torch.zeros(
+        eval_batch_size,
+        self.config.model_length,
+        self.config.hidden_dim * 3,
+        device='cuda',
+        dtype=torch.bfloat16)
+      block.cache_idx = 0
 
   def forward(
     self,
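reset_kv_cache now preallocates the full sliding-window buffer for every block instead of clearing it to None, so it must be called before cached sampling to switch the blocks into cache mode. With this checkpoint's sizes (12 blocks, model_length 1024, hidden_dim 768, hence 3 * 768 = 2304 cached features per position) the bf16 buffers are modest; a quick back-of-the-envelope check (sizes taken from the config files above, eval_batch_size=1 assumed):

n_blocks, model_length, hidden_dim = 12, 1024, 768
bytes_per_block = 1 * model_length * (3 * hidden_dim) * 2   # bf16 = 2 bytes
print(f'{n_blocks * bytes_per_block / 2**20:.1f} MiB')      # ~54 MiB total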