Shrey Goel committed
Commit · 0fa2d2b
Parent(s): 6154b48
cleaned training code
src/madsbm/wt_peptide/control_field.py
CHANGED
@@ -98,13 +98,7 @@ class PeptideControlField(nn.Module):
         self.embed_model.eval()
         for param in self.embed_model.parameters():
             param.requires_grad = False
-
-        # # Unfreeze QKV in last few encoder layers
-        # encoder_layers = self.embed_model.esm.encoder.layer
-        # for layer in encoder_layers[-cfg.training.n_unfrozen:]:
-        #     for param in layer.parameters():
-        #         param.requires_grad = True
-
+
         self.time_embed = TimeEmbedding(
             hidden_dim=cfg.time_embed.time_dim,
             fourier_dim=cfg.time_embed.fourier_dim,
@@ -118,10 +112,6 @@ class PeptideControlField(nn.Module):
 
         self.final_norm = nn.LayerNorm(cfg.model.hidden_dim, eps=1e-6)
 
-        # self.output_proj = self.embed_model.lm_head
-        # for param in self.output_proj.parameters():
-        #     param.requires_grad = False
-
         self.output_proj = nn.Linear(cfg.model.hidden_dim, self.tokenizer.vocab_size)
         nn.init.zeros_(self.output_proj.weight)
         nn.init.zeros_(self.output_proj.bias)
@@ -150,50 +140,4 @@ class PeptideControlField(nn.Module):
             "dit": logits,
             "madsbm": u_base + logits
         }
-
-
-
-
-    # def forward(self, t, xt, attention_mask):
-    #     outs = self.embed_model(input_ids=xt, attention_mask=attention_mask, output_hidden_states=True)
-    #     h = outs.hidden_states[-1]
-    #     t_emb = self.time_embed(t) # [B, time_dim]
-
-    #     # Transformer head (key_padding_mask=True for pads)
-    #     key_padding_mask = (attention_mask == 0) # (B, L) bool
-    #     for dit_block in self.blocks:
-    #         h = dit_block(h, t_emb, key_padding_mask=key_padding_mask)
-
-    #     # Final norm + projection to vocab logits
-    #     h = self.final_norm(h) # [B, L, hidden_dim]
-    #     logits = self.output_proj(h) # [B, L, V]
-    #     return logits
-
-
-    # def forward(self, xt, attention_mask, t):
-    #     with torch.no_grad():
-    #         base_out = self.embed_model(
-    #             input_ids=xt,
-    #             attention_mask=attention_mask,
-    #             output_hidden_states=True
-    #         )
-
-    #     logits_base = base_out.logits
-    #     h_base = base_out.hidden_states[-1]
-
-    #     norm = self.token_norm_sqrd.view(1,1,-1) # 1, 1, V
-
-    #     log_R0 = (self.beta1 * logits_base) - (self.beta2 * norm)
-
-    #     t_emb = self.time_embed(t) # [B, time_dim]
-    #     key_padding_mask = (attention_mask == 0) # (B, L) bool
-
-    #     h_ctrl = h_base
-    #     for dit_block in self.blocks:
-    #         h_ctrl = dit_block(h_ctrl, t_emb, key_padding_mask=key_padding_mask)
-
-    #     h_ctrl = self.final_norm(h_ctrl)
-    #     u_theta = self.output_proj(h_ctrl)
-    #     tot_logits = log_R0 + u_theta
-
-    #     return tot_logits, u_theta
+
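A note on what survives the cleanup: the control field keeps the ESM embedding model frozen (`eval()` plus `requires_grad = False`) and zero-initializes its output projection, so the learned correction starts as an exact no-op and the "madsbm" head initially returns just the base rates `u_base`. Below is a minimal sketch of that zero-init head pattern; `hidden_dim` and `vocab_size` stand in for `cfg.model.hidden_dim` and `self.tokenizer.vocab_size`, and the standalone module is illustrative, not the repo's class:

import torch
import torch.nn as nn

class ZeroInitHead(nn.Module):
    """Projection head that is an exact no-op at initialization."""
    def __init__(self, hidden_dim: int, vocab_size: int):
        super().__init__()
        self.proj = nn.Linear(hidden_dim, vocab_size)
        # Zero weights and bias: at step 0 the control contributes nothing,
        # so u_base + logits == u_base and training starts at the base model.
        nn.init.zeros_(self.proj.weight)
        nn.init.zeros_(self.proj.bias)

    def forward(self, h: torch.Tensor) -> torch.Tensor:
        return self.proj(h)  # [B, L, V] additive correction to base rates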
src/madsbm/wt_peptide/sbm_module.py
CHANGED
@@ -31,19 +31,13 @@ class MadSBM(pl.LightningModule):
         for param in self.embed_model.parameters():
             param.requires_grad = False
 
-        self.beta = 1.0 / self.config.model.hidden_dim
-
-        # self.L = config.data.max_seq_len
-        # self.V = self.vocab_size
-        # self.log_R0 = - math.log(self.L * self.V) # uninformed generator is constant
-
         self.time_schedule = config.time_embed.time_schedule
         self.anneal_frac = config.time_embed.anneal_frac
         self.eps = float(config.time_embed.min_time)
         self.t_max = 1.0 - self.eps
 
 
-    # -------#
+    # -------# Main Training Logic #-------- #
     def forward(self, input_ids, attention_mask, t):
         return self.model(xt=input_ids, attention_mask=attention_mask, t=t)
 
@@ -76,32 +70,8 @@ class MadSBM(pl.LightningModule):
         loss = sample_loss.mean()
         ppl = torch.exp(loss)
 
-        _print(f'loss: {loss}')
-        _print(f'ppl: {ppl}')
-
         return loss, ppl, max_u_logit, max_esm_logit
 
-
-
-    # def step(self, batch):
-    #     x1 = batch['input_ids']
-    #     attn_mask = batch['attention_mask']
-    #     maskable = self.is_maskable(x1)
-
-    #     t = self.sample_t(x1)
-    #     xt = self.noise_seq(x1, t, maskable_mask=maskable)
-
-    #     u_theta = self.forward(xt, attn_mask, t)
-    #     b, l, v_target = self.compute_target(x1, xt, t, maskable_mask=maskable)
-    #     loss, ppl = self.compute_loss(u_theta, v_target, x1, b, l)
-
-    #     _print(f'loss: {loss}')
-    #     _print(f'ppl: {ppl}')
-
-    #     return loss, ppl
-
-
-    # -------# Main Training Logic #-------- #
     def noise_seq(self, x1, t, maskable_mask):
         B, L = x1.shape
         t = t.unsqueeze(1) # B, 1
@@ -114,46 +84,6 @@ class MadSBM(pl.LightningModule):
         xt = xt.masked_fill(masked, self.mask_id)
 
         return xt
-
-    # def compute_target(self, x1, xt, t, maskable_mask):
-    #     L = x1.size(1)
-    #     V = self.vocab_size
-    #     device = x1.device
-
-    #     mask = (xt == self.mask_id) & maskable_mask
-    #     b, l = torch.nonzero(mask, as_tuple=True)
-
-    #     if b.numel() == 0:
-    #         return b, l, torch.empty(0, device=device, dtype=torch.long)
-
-    #     log_R0 = - math.log(L * V) # uniform generator with rates (1 / L*V)
-    #     time = - torch.log(1 - t[b])
-
-    #     v_target = time - log_R0 # log(1/1-t) - log(1/L*V)
-    #     v_target = v_target.clamp(min=-100.0, max=100.0)
-
-    #     return b, l, v_target
-
-
-    # def compute_loss(self, u_theta, v_target, x1, b, l):
-    #     if b.numel() == 0:
-    #         dummy_loss = 0.0 * u_theta.sum()
-    #         return dummy_loss, torch.tensor(0.0, device=u_theta.device)
-
-    #     true_toks = x1[b, l]
-    #     u_pred = u_theta[b, l, :] # N_masks, V
-
-    #     tgt = torch.zeros_like(u_pred)
-    #     tgt.scatter_(1, true_toks.unsqueeze(1), v_target.unsqueeze(1))
-
-    #     sse = F.mse_loss(u_pred, tgt, reduction='sum')
-    #     loss = sse / b.numel() if b.numel != 0 else sse # normalize by number of masks
-
-    #     with torch.no_grad():
-    #         ppl = torch.exp(F.cross_entropy(u_pred, true_toks))
-
-    #     return loss, ppl
-
 
     # -------# Time Schedules #-------- #
     def sample_t(self, x1):