ChatterjeeLab
/

moPPIt

Model card Files Files and versions

xet

Community

AlienChen committed on Mar 5

Commit

f122106

verified ·

1 Parent(s): fe763fa

Update models/peptide_classifiers.py

Browse files

Files changed (1) hide show

models/peptide_classifiers.py +210 -43

models/peptide_classifiers.py CHANGED Viewed

@@ -509,7 +509,7 @@ class AffinityModel(nn.Module):
 class HemolysisModel:
     def __init__(self, device):
-        self.predictor = xgb.Booster(model_file='./classifier_ckpt/best_model_hemolysis.json')
         self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
         self.model.eval()
@@ -544,47 +544,58 @@ class HemolysisModel:
         scores = self.get_scores(input_seqs)
         return scores
 class NonfoulingModel:
     def __init__(self, device):
-        # change model path
-        self.predictor = xgb.Booster(model_file='./classifier_ckpt/best_model_nonfouling.json')
         self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
         self.model.eval()
         self.device = device
-    def generate_embeddings(self, sequences):
-        """Generate ESM embeddings for protein sequences"""
         with torch.no_grad():
-            embeddings = self.model(input_ids=sequences).last_hidden_state.mean(dim=1)
-            embeddings = embeddings.cpu().numpy()
-        return embeddings
-    def get_scores(self, input_seqs):
-        scores = np.zeros(len(input_seqs))
-        features = self.generate_embeddings(input_seqs)
-        if len(features) == 0:
-            return scores
-        features = np.nan_to_num(features, nan=0.)
-        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)
-        features = xgb.DMatrix(features)
-        scores = self.predictor.predict(features)
-        return torch.from_numpy(scores).to(self.device)
-    def __call__(self, input_seqs: list):
-        scores = self.get_scores(input_seqs)
-        return scores
 class SolubilityModel:
     def __init__(self, device):
         # change model path
-        self.predictor = xgb.Booster(model_file='./classifier_ckpt/best_model_solubility.json')
         self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
         self.model.eval()
@@ -624,7 +635,8 @@ class SolubilityModelNew:
         self.device = device
     def get_scores(self, x):
-        mask = (x.unsqueeze(-1) == self.hydro_ids).any(dim=-1)
         ratios = mask.float().mean(dim=1)
         return 1 - ratios
@@ -663,24 +675,179 @@ class PeptideCNN(nn.Module):
             return features
         return self.predictor(features)  # Output shape: (B, 1)
 class HalfLifeModel:
-    def __init__(self, device):
-        input_dim = 1280
-        hidden_dims = [input_dim // 2, input_dim // 4]
-        output_dim = input_dim // 8
-        dropout_rate = 0.3
-        self.model = PeptideCNN(input_dim, hidden_dims, output_dim, dropout_rate).to(device)
-        self.model.load_state_dict(torch.load('./classifier_ckpt/best_model_half_life.pth', map_location=device, weights_only=False))
-        self.model.eval()
-    def __call__(self, x):
-        prediction = self.model(x, return_features=False)
-        halflife = torch.clamp(prediction.squeeze(-1), max=2.0, min=0.0)
-        return halflife / 2
 def load_bindevaluator(checkpoint_path, device):
-    bindevaluator = BindEvaluator.load_from_checkpoint(checkpoint_path, weights_only=False, n_layers=8, d_model=128, d_hidden=128, n_head=8, d_k=64, d_v=128, d_inner=64).to(device)
     bindevaluator.eval()
     for param in bindevaluator.parameters():
         param.requires_grad = False

 class HemolysisModel:
     def __init__(self, device):
+        self.predictor = xgb.Booster(model_file='/scratch/pranamlab/tong/collection/classifiers/ckpt/wt_hemolysis.json')
         self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
         self.model.eval()
         scores = self.get_scores(input_seqs)
         return scores
+# ======================== MLP =========================================
+# Still need mean pooling along lengths
+class MaskedMeanPool(nn.Module):  # Pools (B,L,H) features to (B,H) with a mask-weighted mean over dim 1; holds no parameters.
+    def forward(self, X, M):  # X: (B,L,H), M: (B,L); M is cast to float and used as a per-position weight
+        Mf = M.unsqueeze(-1).float()  # (B,L,1), broadcasts against X along H
+        denom = Mf.sum(dim=1).clamp(min=1.0)  # per-row mask total, floored at 1 so an all-zero mask does not divide by zero
+        return (X * Mf).sum(dim=1) / denom  # (B,H)
+class MLPClassifier(nn.Module):  # Masked mean pooling followed by a two-layer MLP that emits one logit per example.
+    def __init__(self, in_dim, hidden=512, dropout=0.1):  # in_dim: input feature width; hidden: hidden-layer width; dropout: drop probability
+        super().__init__()
+        self.pool = MaskedMeanPool()
+        self.net = nn.Sequential(  # Linear -> GELU -> Dropout -> Linear(hidden, 1)
+            nn.Linear(in_dim, hidden),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(hidden, 1),
+        )
+    def forward(self, X, M):  # X and M are handed unchanged to MaskedMeanPool
+        z = self.pool(X, M)  # pooled features, one row per example
+        return self.net(z).squeeze(-1)  # logits (trailing size-1 dim removed)
+# ======================== MLP =========================================
 class NonfoulingModel:  # ESM-2 token embeddings scored by an MLPClassifier restored from a checkpoint.
     def __init__(self, device):
+        ckpt = torch.load('/scratch/pranamlab/tong/collection/classifiers/ckpt/wt_nonfouling.pt', weights_only=False, map_location=device)  # weights_only=False performs full unpickling, so the file must be trusted
+        best_params = ckpt["best_params"]  # hyperparameters stored alongside the weights
+        self.predictor = MLPClassifier(in_dim=1280, hidden=int(best_params["hidden"]), dropout=float(best_params.get("dropout", 0.1)))  # dropout falls back to 0.1 when the checkpoint has no "dropout" entry
+        self.predictor.load_state_dict(ckpt["state_dict"])
+        self.predictor = self.predictor.to(device)
+        self.predictor.eval()
         self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
         self.model.eval()
         self.device = device
+    def get_scores(self, input_ids, attention_mask):  # returns the predictor's raw logits
         with torch.no_grad():
+            features = self.model(input_ids=input_ids, attention_mask=attention_mask).last_hidden_state  # per-token ESM hidden states, computed without gradients
+        scores = self.predictor(features, attention_mask)  # NOTE(review): runs outside the no_grad block above
+        return scores
+    def __call__(self, input_ids):  # returns the logistic sigmoid of the logits from get_scores
+        attention_mask = torch.ones_like(input_ids).to(self.device)  # all-ones mask: every position is treated as a real token, nothing is masked as padding
+        scores = self.get_scores(input_ids, attention_mask)
+        return 1.0 / (1.0 + torch.exp(-scores))  # hand-written sigmoid
 class SolubilityModel:
     def __init__(self, device):
         # change model path
+        self.predictor = xgb.Booster(model_file='/scratch/pranamlab/tong/checkpoints/MOG-DFM/classifier_ckpt/best_model_solubility.json')
         self.model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(device)
         self.model.eval()
         self.device = device
     def get_scores(self, x):  # per row: 1 - (fraction of interior positions whose id appears in self.hydro_ids)
+        a = x[:, 1:-1]  # drop the first and last column; presumably the special start/end tokens, confirm against the tokenizer
+        mask = (a.unsqueeze(-1) == self.hydro_ids).any(dim=-1)  # True where a position equals any entry of self.hydro_ids
         ratios = mask.float().mean(dim=1)
         return 1 - ratios
             return features
         return self.predictor(features)  # Output shape: (B, 1)
+# class HalfLifeModel:
+#     def __init__(self, device):
+#         input_dim = 1280
+#         hidden_dims = [input_dim // 2, input_dim // 4]
+#         output_dim = input_dim // 8
+#         dropout_rate = 0.3
+#         self.model = PeptideCNN(input_dim, hidden_dims, output_dim, dropout_rate).to(device)
+#         self.model.load_state_dict(torch.load('/scratch/pranamlab/tong/checkpoints/MOG-DFM/classifier_ckpt/best_model_half_life.pth', map_location=device, weights_only=False))
+#         self.model.eval()
+#     def __call__(self, x):
+#         prediction = self.model(x, return_features=False)
+#         halflife = torch.clamp(prediction.squeeze(-1), max=2.0, min=0.0)
+#         return halflife / 2
+# -----------------------------
+# Model definition (must match training)
+# -----------------------------
+class TransformerRegressor(nn.Module):  # Linear projection -> TransformerEncoder -> masked mean pool -> scalar head.
+    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):  # layers: number of encoder layers; ff: encoder feed-forward width
+        super().__init__()
+        self.proj = nn.Linear(in_dim, d_model)  # maps input features to the encoder width
+        enc_layer = nn.TransformerEncoderLayer(
+            d_model=d_model,
+            nhead=nhead,
+            dim_feedforward=ff,
+            dropout=dropout,
+            batch_first=True,  # tensors are laid out (batch, seq, feature)
+            activation="gelu",
+        )
+        self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
+        self.head = nn.Linear(d_model, 1)  # one scalar per pooled sequence
+    def forward(self, X, M):  # X: features, M: keep-mask; returns one value per batch row
+        # M: True = keep token, False = padding
+        pad_mask = ~M  # src_key_padding_mask flags positions to ignore, i.e. the inverse of M; assumes M is a bool tensor
+        Z = self.proj(X)
+        Z = self.enc(Z, src_key_padding_mask=pad_mask)
+        Mf = M.unsqueeze(-1).float()
+        denom = Mf.sum(dim=1).clamp(min=1.0)  # floored at 1 so a fully padded row does not divide by zero
+        pooled = (Z * Mf).sum(dim=1) / denom  # mean over kept positions only
+        return self.head(pooled).squeeze(-1)  # trailing size-1 dim removed
+def build_model(model_name: str, in_dim: int, params: dict) -> nn.Module:  # NOTE: `params` is accepted but never read in this function
+    # In your training code, transformer uses fixed architecture values (d_model/nhead/layers/ff/dropout).
+    # (See build_model in finetune_nn_cv.py)
+    if model_name != "transformer":  # any other name is rejected
+        raise ValueError(f"This inference file currently supports model_name='transformer', got: {model_name}")
+    return TransformerRegressor(  # hard-coded hyperparameters; per the note above they are meant to mirror the training script
+        in_dim=in_dim,
+        d_model=384,
+        nhead=4,
+        layers=1,
+        ff=512,
+        dropout=0.1521676463658988,
+    )
+def _clean_state_dict(state_dict: dict) -> dict:  # returns a new dict with key prefixes stripped; the input mapping is not modified
+    cleaned = {}
+    for k, v in state_dict.items():
+        if k.startswith("module."):  # strip one leading "module." (presumably a parallel-wrapper prefix; confirm against the training script)
+            k = k[len("module.") :]
+        if k.startswith("model."):  # then strip one leading "model.", checked after the first strip
+            k = k[len("model.") :]
+        cleaned[k] = v  # values are carried over unchanged; keys that collapse to the same name overwrite earlier ones
+    return cleaned
 class HalfLifeModel:
+    """
+    Loads:
+      - ESM2 encoder to generate *unpooled* token embeddings (per residue)
+      - Your fine-tuned TransformerRegressor from final_model.pt
+    By default, __call__ returns "hours":
+      - if ckpt['target_col'] == 'log_label' -> expm1(pred)
+      - else -> raw pred
+    """
+    def __init__(
+        self,
+        device,
+        ckpt_path = "/scratch/pranamlab/tong/PeptiVerse/src/halflife/finetune_stability_transformer_log/final_model.pt",
+    ):
+        self.device = device
+        # --- load NN checkpoint (saved by your finetune script) ---
+        ckpt = torch.load(ckpt_path, map_location="cpu")  # loaded on CPU first; the regressor is moved to self.device further down
+        if not isinstance(ckpt, dict) or "state_dict" not in ckpt:
+            raise ValueError(f"Checkpoint at {ckpt_path} is not the expected dict with a 'state_dict' key.")
+        self.best_params = ckpt.get("best_params", {})
+        self.in_dim = int(ckpt.get("in_dim"))  # NOTE(review): a checkpoint without "in_dim" makes this int(None) -> TypeError
+        self.target_col = ckpt.get("target_col", "label")  # 'log_label' or 'label'
+        # --- build + load regressor ---
+        self.regressor = build_model(model_name="transformer", in_dim=self.in_dim, params=self.best_params)  # best_params is passed along, but build_model in this file does not read it
+        self.regressor.load_state_dict(_clean_state_dict(ckpt["state_dict"]), strict=True)  # strict=True: any missing or unexpected key raises
+        self.regressor.to(self.device)
+        self.regressor.eval()
+        # --- ESM2 embedding model ---
+        self.emb_model = EsmModel.from_pretrained("facebook/esm2_t33_650M_UR50D").to(self.device)
+        self.emb_model.eval()
+        self.tokenizer = AutoTokenizer.from_pretrained("facebook/esm2_t33_650M_UR50D")  # loaded here but not used by any method of this class
+        # sanity: ESM2 hidden size should match training in_dim
+        esm_hidden = int(self.emb_model.config.hidden_size)
+        if esm_hidden != self.in_dim:
+            raise ValueError(
+                f"Mismatch: ESM hidden_size={esm_hidden}, but checkpoint in_dim={self.in_dim}.\n"
+                f"Did you train on a different embedding model/dimension than facebook/esm2_t33_650M_UR50D?"
+            )
+    @torch.no_grad()
+    def _embed_unpooled_batch(self, sequences):  # sequences is fed to ESM as input_ids, so it holds token ids, not raw strings
+        out = self.emb_model(input_ids=sequences)  # no attention_mask is supplied
+        hs = out.last_hidden_state  # (B, T, H)
+        per_seq = []
+        lengths = []
+        for i in range(hs.shape[0]):
+            emb = hs[i, 1:-1, :]    # (L, H): first and last token positions dropped
+            per_seq.append(emb)
+            lengths.append(int(emb.shape[0]))
+        Lmax = max(lengths) if lengths else 0  # every row is sliced from the same tensor, so all lengths equal T-2
+        H = hs.shape[-1]
+        X = hs.new_zeros((len(sequences), Lmax, H), dtype=torch.float32)  # zero-initialised float32 batch buffer
+        M = torch.zeros((len(sequences), Lmax), dtype=torch.bool, device=self.device)  # keep-mask, True where X holds a real position
+        for i, emb in enumerate(per_seq):
+            L = emb.shape[0]
+            if L == 0:
+                continue
+            X[i, :L, :] = emb.to(torch.float32)
+            M[i, :L] = True
+        return X, M
+    @torch.no_grad()
+    def predict_raw(self, input_seqs):
+        """
+        Returns the regressor output in the same space as training target_col:
+          - if trained on log_label -> returns log1p(hours)
+          - if trained on label     -> returns hours (or whatever label scale was)
+        """
+        if len(input_seqs) == 0:  # empty batch: skip the models entirely
+            return np.array([], dtype=np.float32)
+        X, M = self._embed_unpooled_batch(input_seqs)
+        yhat = self.regressor(X, M).detach().cpu().numpy().astype(np.float32)  # (B,)
+        # pdb.set_trace()
+        return yhat
+    def predict_hours(self, input_seqs) -> np.ndarray:
+        """
+        If your model was trained on log_label, convert back to hours via expm1.
+        Otherwise returns raw predictions.
+        """
+        raw = self.predict_raw(input_seqs)
+        if self.target_col == "log_label":
+            return np.expm1(raw).astype(np.float32)  # expm1 inverts the log1p target transform
+        return raw.astype(np.float32)
+    def __call__(self, input_seqs) -> np.ndarray:  # NOTE(review): annotated np.ndarray, but the value returned is a torch.Tensor on self.device
+        return torch.from_numpy(self.predict_hours(input_seqs)).to(self.device)
 def load_bindevaluator(checkpoint_path, device):
+    bindevaluator = BindEvaluator.load_from_checkpoint(checkpoint_path, n_layers=8, d_model=128, d_hidden=128, n_head=8, d_k=64, d_v=128, d_inner=64).to(device)
     bindevaluator.eval()
     for param in bindevaluator.parameters():
         param.requires_grad = False