Update classifier_code/half_life.py
classifier_code/half_life.py  (+205 −60)
@@ -1,65 +1,210 @@
 import numpy as np
 import torch
-import xgboost as xgb
-from transformers import EsmModel, EsmTokenizer
 import torch.nn as nn
-import
[old lines 7–65 removed; their bodies are elided in this diff view apart from fragments: "super().__init__()" (kept as context), "self.", "#", "return"]
+import os
+from typing import List, Optional, Union
+
 import numpy as np
 import torch
 import torch.nn as nn
+from transformers import EsmModel, AutoTokenizer
+
 
+# -----------------------------
+# Model definition (must match training)
+# -----------------------------
+class TransformerRegressor(nn.Module):
+    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
         super().__init__()
+        self.proj = nn.Linear(in_dim, d_model)
+        enc_layer = nn.TransformerEncoderLayer(
+            d_model=d_model,
+            nhead=nhead,
+            dim_feedforward=ff,
+            dropout=dropout,
+            batch_first=True,
+            activation="gelu",
+        )
+        self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
+        self.head = nn.Linear(d_model, 1)
+
+    def forward(self, X, M):
+        # M: True = keep token, False = padding
+        pad_mask = ~M
+        Z = self.proj(X)
+        Z = self.enc(Z, src_key_padding_mask=pad_mask)
+        Mf = M.unsqueeze(-1).float()
+        denom = Mf.sum(dim=1).clamp(min=1.0)
+        pooled = (Z * Mf).sum(dim=1) / denom
+        return self.head(pooled).squeeze(-1)
+
+
+def build_model(model_name: str, in_dim: int, params: dict) -> nn.Module:
+    if model_name != "transformer":
+        raise ValueError(f"This inference file currently supports model_name='transformer', got: {model_name}")
+    return TransformerRegressor(
+        in_dim=in_dim,
+        d_model=384,
+        nhead=4,
+        layers=1,
+        ff=512,
+        dropout=0.1521676463658988,
+    )
+
+
+def _clean_state_dict(state_dict: dict) -> dict:
+    cleaned = {}
+    for k, v in state_dict.items():
+        if k.startswith("module."):
+            k = k[len("module.") :]
+        if k.startswith("model."):
+            k = k[len("model.") :]
+        cleaned[k] = v
+    return cleaned
+
+
+# -----------------------------
+# Predictor
+# -----------------------------
+class HalflifeTransformer:
+
+    def __init__(
+        self,
+        ckpt_path: str = "/scratch/pranamlab/tong/PeptiVerse/src/halflife/FINETUNED_TRANSFORMER_DIR/final_model.pt",
+        esm_name: str = "facebook/esm2_t33_650M_UR50D",
+        device: Optional[str] = None,
+        model_name: str = "transformer",
+    ):
+        self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))
+
+        ckpt = torch.load(ckpt_path, map_location="cpu")
+        if not isinstance(ckpt, dict) or "state_dict" not in ckpt:
+            raise ValueError(f"Checkpoint at {ckpt_path} is not the expected dict with a 'state_dict' key.")
+
+        self.best_params = ckpt.get("best_params", {})
+        self.in_dim = int(ckpt.get("in_dim"))
+        self.target_col = ckpt.get("target_col", "label")  # 'log_label' or 'label'
+        self.model_name = model_name
+
+        # --- build + load regressor ---
+        self.regressor = build_model(model_name=self.model_name, in_dim=self.in_dim, params=self.best_params)
+        self.regressor.load_state_dict(_clean_state_dict(ckpt["state_dict"]), strict=True)
+        self.regressor.to(self.device)
+        self.regressor.eval()
+
+        # --- ESM2 embedding model ---
+        self.emb_model = EsmModel.from_pretrained(esm_name).to(self.device)
+        self.emb_model.eval()
+        self.tokenizer = AutoTokenizer.from_pretrained(esm_name)
+
+        # sanity: ESM2 hidden size should match training in_dim
+        esm_hidden = int(self.emb_model.config.hidden_size)
+        if esm_hidden != self.in_dim:
+            raise ValueError(
+                f"Mismatch: ESM hidden_size={esm_hidden}, but checkpoint in_dim={self.in_dim}.\n"
+                f"Did you train on a different embedding model/dimension than {esm_name}?"
+            )
+
+    @torch.no_grad()
+    def _embed_unpooled_batch(
+        self,
+        sequences: List[str],
+        max_length: int = 1024,
+    ):
+        """
+        Returns:
+            X: (B, Lmax, H) float32
+            M: (B, Lmax) bool, True for real residues, False for padding
+        """
+        if len(sequences) == 0:
+            X = torch.zeros((0, 1, self.in_dim), dtype=torch.float32, device=self.device)
+            M = torch.zeros((0, 1), dtype=torch.bool, device=self.device)
+            return X, M
+
+        toks = self.tokenizer(
+            sequences,
+            return_tensors="pt",
+            padding=True,
+            truncation=True,
+            max_length=max_length,
+            add_special_tokens=True,
+        )
+        toks = {k: v.to(self.device) for k, v in toks.items()}
+
+        out = self.emb_model(**toks)
+        hs = out.last_hidden_state  # (B, T, H)
+        attn = toks["attention_mask"].bool()  # (B, T)
+
+        per_seq = []
+        lengths = []
+
+        for i in range(hs.shape[0]):
+            valid_idx = torch.nonzero(attn[i], as_tuple=False).squeeze(-1)
+            # ESM typically has <cls> ... tokens ... <eos> among valid positions
+            if valid_idx.numel() <= 2:
+                emb = hs.new_zeros((0, hs.shape[-1]))
+            else:
+                core_idx = valid_idx[1:-1]  # drop CLS and EOS
+                emb = hs[i, core_idx, :]  # (L, H)
+            per_seq.append(emb)
+            lengths.append(int(emb.shape[0]))
+
+        Lmax = max(lengths) if lengths else 0
+        H = hs.shape[-1]
+        X = hs.new_zeros((len(sequences), Lmax, H), dtype=torch.float32)
+        M = torch.zeros((len(sequences), Lmax), dtype=torch.bool, device=self.device)
+
+        for i, emb in enumerate(per_seq):
+            L = emb.shape[0]
+            if L == 0:
+                continue
+            X[i, :L, :] = emb.to(torch.float32)
+            M[i, :L] = True
+
+        return X, M
+
+    @torch.no_grad()
+    def predict_raw(
+        self,
+        input_seqs: List[str],
+        batch_size: int = 16,
+    ) -> np.ndarray:
+        """
+        Returns the regressor output in the same space as the training target_col:
+          - if trained on log_label -> returns log1p(hours)
+          - if trained on label     -> returns hours (or whatever the label scale was)
+        """
+        if len(input_seqs) == 0:
+            return np.array([], dtype=np.float32)
+
+        preds = []
+        for i in range(0, len(input_seqs), batch_size):
+            batch = input_seqs[i : i + batch_size]
+            X, M = self._embed_unpooled_batch(batch)
+            yhat = self.regressor(X, M)  # (B,)
+            preds.append(yhat.detach().cpu().numpy().astype(np.float32))
+
+        return np.concatenate(preds, axis=0)
+
+    def predict_hours(self, input_seqs: List[str], batch_size: int = 16) -> np.ndarray:
+        """
+        If the model was trained on log_label, convert back to hours via expm1.
+        Otherwise return raw predictions.
+        """
+        raw = self.predict_raw(input_seqs, batch_size=batch_size)
+        if self.target_col == "log_label":
+            return np.expm1(raw).astype(np.float32)
+        return raw.astype(np.float32)
+
+    def __call__(self, input_seqs: List[str], batch_size: int = 16) -> np.ndarray:
+        return self.predict_hours(input_seqs, batch_size=batch_size)
+
+
+def unittest():
+    ckpt_path = "../classifier_ckpt/wt_halflife.pt"
+
+    halflife = HalflifeTransformer(ckpt_path=ckpt_path)
+    seqs = ["MWQRPSSWIEGRFPHSDAVFTDQYTRLRKQLAAKKYLQSLKQKRY"]
+    pred = halflife(seqs)
+    print("pred_hours:", pred)
+
+
+if __name__ == "__main__":
+    unittest()
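
For context, a minimal usage sketch of the new predictor from a caller's side. The import path and checkpoint location below are assumptions (taken from the file name and the unittest() function), not something this diff confirms:

    # Hypothetical caller; module path and checkpoint location are assumptions.
    from classifier_code.half_life import HalflifeTransformer

    predictor = HalflifeTransformer(ckpt_path="../classifier_ckpt/wt_halflife.pt")
    # __call__ delegates to predict_hours: if the checkpoint was trained on
    # log_label, expm1 converts predictions back to hours; otherwise raw
    # label-scale values are returned.
    hours = predictor(["MWQRPSSWIEGRFPHSDAVFTDQYTRLRKQLAAKKYLQSLKQKRY"])
    print(hours)  # np.ndarray of shape (1,)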