| import os |
| from typing import List, Optional, Union |
|
|
| import numpy as np |
| import torch |
| import torch.nn as nn |
| from transformers import EsmModel, AutoTokenizer |
|
|
|
|
| |
| |
| |
class TransformerRegressor(nn.Module):
    """Transformer encoder with masked mean pooling and a scalar output head.

    Per-position features are linearly projected to ``d_model``, passed
    through a stack of ``nn.TransformerEncoderLayer`` blocks (GELU activation,
    ``batch_first=True``), averaged over the positions the mask marks as
    valid, and mapped to one value per sequence.

    Args:
        in_dim: Size of the last axis of the input features.
        d_model: Width of the encoder.
        nhead: Number of attention heads.
        layers: Number of encoder layers.
        ff: Hidden width of each layer's feed-forward sub-block.
        dropout: Dropout probability used inside the encoder layers.
    """

    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(in_dim, d_model)
        enc_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=ff,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
        )
        self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, X, M):
        """Return one prediction per sequence.

        Args:
            X: Features indexed as (batch, position, in_dim).
            M: Boolean mask indexed as (batch, position); True marks a valid
                position, False marks padding.

        Returns:
            A tensor with one value per batch row. A row with no valid
            position pools to the zero vector, so its output is the head's
            bias rather than NaN.
        """
        pad_mask = ~M
        Z = self.proj(X)

        # Attention over zero positions is undefined; with an empty position
        # axis the pooled vector below is all zeros anyway.
        if Z.shape[1] > 0:
            Z = self.enc(Z, src_key_padding_mask=pad_mask)

        # A row whose positions are all padding can come out of the encoder
        # as NaN, and NaN * 0 stays NaN. Overwrite padded positions instead of
        # relying on multiplication by the mask alone.
        Z = Z.masked_fill(pad_mask.unsqueeze(-1), 0.0)

        Mf = M.unsqueeze(-1).float()
        denom = Mf.sum(dim=1).clamp(min=1.0)  # avoid 0/0 for empty rows
        pooled = (Z * Mf).sum(dim=1) / denom
        return self.head(pooled).squeeze(-1)
|
|
|
|
def build_model(model_name: str, in_dim: int, params: dict) -> nn.Module:
    """Construct the regressor architecture used by this inference file.

    Only ``model_name == "transformer"`` is supported; any other value raises
    ``ValueError``. The architecture settings are fixed inside this function:
    ``params`` is accepted but never read.
    """
    if model_name == "transformer":
        # Fixed architecture settings; `params` does not override them.
        hyperparams = dict(
            d_model=384,
            nhead=4,
            layers=1,
            ff=512,
            dropout=0.1521676463658988,
        )
        return TransformerRegressor(in_dim=in_dim, **hyperparams)
    raise ValueError(f"This inference file currently supports model_name='transformer', got: {model_name}")
|
|
|
|
| def _clean_state_dict(state_dict: dict) -> dict: |
| cleaned = {} |
| for k, v in state_dict.items(): |
| if k.startswith("module."): |
| k = k[len("module.") :] |
| if k.startswith("model."): |
| k = k[len("model.") :] |
| cleaned[k] = v |
| return cleaned |
|
|
|
|
| |
| |
| |
class HalflifeTransformer:
    """Half-life predictor: per-token ESM embeddings fed to a trained regressor.

    Construction loads a regressor checkpoint plus a pretrained ESM model and
    its tokenizer. Calling the instance with sequences returns one prediction
    per sequence as a float32 NumPy array.
    """

    def __init__(
        self,
        ckpt_path: str = "/scratch/pranamlab/tong/PeptiVerse/src/halflife/FINETUNED_TRANSFORMER_DIR/final_model.pt",
        esm_name: str = "facebook/esm2_t33_650M_UR50D",
        device: Optional[str] = None,
        model_name: str = "transformer",
    ):
        """Load the regressor checkpoint and the ESM embedding model.

        Args:
            ckpt_path: File read with ``torch.load``. It must hold a dict with
                ``"state_dict"`` and ``"in_dim"``; ``"best_params"`` and
                ``"target_col"`` are optional.
            esm_name: Identifier passed to ``EsmModel.from_pretrained`` and
                ``AutoTokenizer.from_pretrained``.
            device: Torch device string. Defaults to ``"cuda"`` when
                available, otherwise ``"cpu"``.
            model_name: Architecture name forwarded to ``build_model``.

        Raises:
            ValueError: If the checkpoint is not a dict with a
                ``"state_dict"`` key, has no ``"in_dim"`` entry, or its
                ``in_dim`` differs from the ESM model's hidden size.
        """
        self.device = torch.device(device or ("cuda" if torch.cuda.is_available() else "cpu"))

        # NOTE(security): torch.load unpickles the file; only point this at
        # checkpoints from a trusted source.
        ckpt = torch.load(ckpt_path, map_location="cpu")
        if not isinstance(ckpt, dict) or "state_dict" not in ckpt:
            raise ValueError(f"Checkpoint at {ckpt_path} is not the expected dict with a 'state_dict' key.")
        # Fail with a clear message instead of the TypeError int(None) raises.
        if ckpt.get("in_dim") is None:
            raise ValueError(f"Checkpoint at {ckpt_path} has no 'in_dim' entry.")

        self.best_params = ckpt.get("best_params", {})
        self.in_dim = int(ckpt["in_dim"])
        self.target_col = ckpt.get("target_col", "label")
        self.model_name = model_name

        # Regressor: rebuild the architecture, then load weights strictly so
        # any key or shape mismatch is reported immediately.
        self.regressor = build_model(model_name=self.model_name, in_dim=self.in_dim, params=self.best_params)
        self.regressor.load_state_dict(_clean_state_dict(ckpt["state_dict"]), strict=True)
        self.regressor.to(self.device)
        self.regressor.eval()

        # Embedding model and tokenizer.
        self.emb_model = EsmModel.from_pretrained(esm_name).to(self.device)
        self.emb_model.eval()
        self.tokenizer = AutoTokenizer.from_pretrained(esm_name)

        # The regressor's input width must equal the embedding width.
        esm_hidden = int(self.emb_model.config.hidden_size)
        if esm_hidden != self.in_dim:
            raise ValueError(
                f"Mismatch: ESM hidden_size={esm_hidden}, but checkpoint in_dim={self.in_dim}.\n"
                f"Did you train on a different embedding model/dimension than {esm_name}?"
            )

    @torch.no_grad()
    def _embed_unpooled_batch(
        self,
        sequences: List[str],
        max_length: int = 1024,
    ):
        """Embed a batch of sequences without pooling over positions.

        Each sequence is tokenized (padded, truncated to ``max_length``) and
        run through the ESM model. For every row, the first and last attended
        tokens are dropped (presumably the special tokens the tokenizer adds)
        and the remaining hidden states are left-aligned in the output.

        Returns:
            X: (B, Lmax, H) float32
            M: (B, Lmax) bool, True for real residues, False for padding

            For an empty ``sequences`` the shapes are (0, 1, in_dim) and (0, 1).
        """
        if len(sequences) == 0:
            X = torch.zeros((0, 1, self.in_dim), dtype=torch.float32, device=self.device)
            M = torch.zeros((0, 1), dtype=torch.bool, device=self.device)
            return X, M

        toks = self.tokenizer(
            sequences,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=max_length,
            add_special_tokens=True,
        )
        toks = {k: v.to(self.device) for k, v in toks.items()}

        out = self.emb_model(**toks)
        hs = out.last_hidden_state
        attn = toks["attention_mask"].bool()

        per_seq = []
        lengths = []
        for i in range(hs.shape[0]):
            valid_idx = torch.nonzero(attn[i], as_tuple=False).squeeze(-1)
            if valid_idx.numel() <= 2:
                # Nothing left once the first and last tokens are removed.
                emb = hs.new_zeros((0, hs.shape[-1]))
            else:
                core_idx = valid_idx[1:-1]
                emb = hs[i, core_idx, :]
            per_seq.append(emb)
            lengths.append(int(emb.shape[0]))

        Lmax = max(lengths) if lengths else 0
        H = hs.shape[-1]
        X = hs.new_zeros((len(sequences), Lmax, H), dtype=torch.float32)
        M = torch.zeros((len(sequences), Lmax), dtype=torch.bool, device=self.device)

        for i, emb in enumerate(per_seq):
            n_tok = emb.shape[0]
            if n_tok == 0:
                continue
            X[i, :n_tok, :] = emb.to(torch.float32)
            M[i, :n_tok] = True

        return X, M

    @torch.no_grad()
    def predict_raw(
        self,
        input_seqs: Union[str, List[str]],
        batch_size: int = 16,
    ) -> np.ndarray:
        """
        Returns the regressor output in the same space as training target_col:
        - if trained on log_label -> returns log1p(hours)
        - if trained on label -> returns hours (or whatever label scale was)

        A single string is treated as one sequence. An empty input returns an
        empty float32 array.

        Raises:
            ValueError: If ``batch_size`` is not positive (only checked when
                there is at least one sequence).
        """
        # A bare string would otherwise be sliced into character chunks and
        # each chunk embedded as if it were its own sequence.
        if isinstance(input_seqs, str):
            input_seqs = [input_seqs]

        if len(input_seqs) == 0:
            return np.array([], dtype=np.float32)

        if batch_size < 1:
            raise ValueError(f"batch_size must be a positive integer, got: {batch_size}")

        preds = []
        for start in range(0, len(input_seqs), batch_size):
            batch = input_seqs[start : start + batch_size]
            X, M = self._embed_unpooled_batch(batch)
            yhat = self.regressor(X, M)
            preds.append(yhat.detach().cpu().numpy().astype(np.float32))

        return np.concatenate(preds, axis=0)

    def predict_hours(self, input_seqs: Union[str, List[str]], batch_size: int = 16) -> np.ndarray:
        """
        If your model was trained on log_label, convert back to hours via expm1.
        Otherwise returns raw predictions.
        """
        raw = self.predict_raw(input_seqs, batch_size=batch_size)
        if self.target_col == "log_label":
            return np.expm1(raw).astype(np.float32)
        return raw.astype(np.float32)

    def __call__(self, input_seqs: Union[str, List[str]], batch_size: int = 16) -> np.ndarray:
        """Shorthand for ``predict_hours``."""
        return self.predict_hours(input_seqs, batch_size=batch_size)
|
|
|
|
def unittest():
    """Smoke-run the predictor on one hard-coded sequence and print the result.

    The checkpoint path is relative, so this only works from a working
    directory where that file exists.
    """
    predictor = HalflifeTransformer(ckpt_path="../classifier_ckpt/wt_halflife.pt")
    print("pred_hours:", predictor(["MWQRPSSWIEGRFPHSDAVFTDQYTRLRKQLAAKKYLQSLKQKRY"]))


# Running this file directly executes the smoke run.
if __name__ == "__main__":
    unittest()