wnagleiofficial
commited on
Commit
·
38adcf4
1
Parent(s):
1cdc5d8
First model version
Browse files- NeuroPredPLM/__init__.py +0 -0
- NeuroPredPLM/args.pt +3 -0
- NeuroPredPLM/model.py +55 -0
- NeuroPredPLM/predict.py +20 -0
- NeuroPredPLM/utils.py +50 -0
- README.md +34 -0
- model.pth +3 -0
NeuroPredPLM/__init__.py
ADDED
|
File without changes
|
NeuroPredPLM/args.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51bbe01a0f9d64a23fc40c16fae8454188cb5ff6e1b661114490ef7e90718df1
|
| 3 |
+
size 4271
|
NeuroPredPLM/model.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
main model
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
from torch import nn
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from einops import rearrange
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
from .utils import length_to_mask, load_model_and_alphabet_core
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class EsmModel(nn.Module):
    """ESM encoder followed by a per-head attention-pooling classifier.

    The encoder and its alphabet are built by ``load_model_and_alphabet_core``
    from the ``args.pt`` file that sits next to this module. On top of it the
    model applies a linear projection, two 1-D convolutions, a learned
    per-head attention vector (``W``) and a small fully connected classifier.

    Args:
        hidden_size: Width of one head slice of the encoder embedding; the
            encoder embedding is split into ``head`` slices of this width.
        num_labels: Number of output classes of the final linear layer.
        projection_size: Output width of the projection layer; the second
            convolution reduces it to ``int(projection_size / 2)``.
        head: Number of slices the encoder embedding is split into.
    """

    def __init__(self, hidden_size=64, num_labels=2, projection_size=24, head=12):
        super().__init__()

        # The encoder hyper-parameters ship with the package as ``args.pt``.
        package_dir = os.path.abspath(os.path.dirname(__file__))
        args_path = os.path.join(package_dir, 'args.pt')
        self.esm, self.alphabet = load_model_and_alphabet_core(args_path)

        self.num_labels = num_labels
        self.head = head
        self.hidden_size = hidden_size

        # NOTE: attribute names and creation order are kept stable because
        # they determine the checkpoint keys and the random initialisation.
        reduced_size = int(projection_size / 2)
        self.projection = nn.Linear(hidden_size, projection_size)
        self.cov_1 = nn.Conv1d(projection_size, projection_size, kernel_size=3, padding='same')
        self.cov_2 = nn.Conv1d(projection_size, reduced_size, kernel_size=1, padding='same')
        self.W = nn.Parameter(torch.randn((head, reduced_size)))
        self.fcn = nn.Sequential(
            nn.Linear(reduced_size * head, reduced_size),
            nn.ReLU(),
            nn.Linear(reduced_size, num_labels),
        )

    def forward(self, peptide_list, device='cpu'):
        """Score a batch of peptides.

        Args:
            peptide_list: Sequence of items whose element ``[1]`` is the
                peptide sequence; the items are handed unchanged to the
                alphabet's batch converter.
            device: Device the token tensor and the padding mask are moved to.

        Returns:
            A tuple ``(outputs, att)`` where ``outputs`` is the result of the
            classifier head and ``att`` holds the softmax attention weights
            with axes (batch, head, position).
        """
        lengths = torch.tensor([len(entry[1]) for entry in peptide_list])

        to_batch = self.alphabet.get_batch_converter()
        _, _, tokens = to_batch(peptide_list)
        tokens = tokens.to(device)

        encoder_out = self.esm(tokens, repr_layers=[12], return_contacts=False)
        # Skip the first position of every sequence
        # (presumably the prepended start token - TODO confirm).
        residue_embed = encoder_out["representations"][12][:, 1:, :]

        # Fold the head slices into the batch axis so every slice goes
        # through the same projection and convolutions.
        per_head = rearrange(residue_embed, 'b l (h d)-> (b h) l d', h=self.head)
        features = rearrange(self.projection(per_head), 'b l d -> b d l')
        features = F.relu(self.cov_1(features))
        features = F.relu(self.cov_2(features))
        features = rearrange(features, '(b h) d l -> b h l d', h=self.head)

        # One learned scoring vector per head; padded positions are set to
        # -inf so they receive zero weight after the softmax.
        scores = torch.einsum('bhld,hd->bhl', features, self.W)
        valid = length_to_mask(lengths).to(device)
        scores = scores.masked_fill(valid.unsqueeze(1) == 0, -np.inf)
        att = F.softmax(scores, dim=-1)

        # Attention-weighted sum over positions, heads concatenated.
        weighted = rearrange(features * att.unsqueeze(-1), 'b h l d -> b l (h d)')
        pooled = torch.sum(weighted, dim=1)
        return self.fcn(pooled), att
|
| 54 |
+
|
| 55 |
+
|
NeuroPredPLM/predict.py
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import EsmModel
|
| 2 |
+
from .utils import load_hub_workaround
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
# Download location of the trained weights that predict() loads into EsmModel.
MODEL_URL = "https://zenodo.org/record/7042286/files/model.pth"
|
| 6 |
+
|
| 7 |
+
def predict(peptide_list, device='cpu'):
    """Classify peptides and return their per-head attention weights.

    A fresh ``EsmModel`` is built on every call and its weights are fetched
    from ``MODEL_URL`` through ``load_hub_workaround``.

    Args:
        peptide_list: Sequence of ``(peptide_id, sequence)`` pairs.
        device: Device the model is moved to and inference runs on.

    Returns:
        A dict mapping each ``peptide_id`` to ``[label, attention]`` where
        ``label`` is the index of the largest model output and ``attention``
        is a NumPy array whose second axis is trimmed to the peptide length.
        Entries sharing a ``peptide_id`` overwrite each other. An empty
        ``peptide_list`` yields an empty dict without building the model or
        downloading the weights.
    """
    # Nothing to score: skip the expensive model construction and download.
    if not peptide_list:
        return {}

    with torch.no_grad():
        neuroPred_model = EsmModel()
        neuroPred_model.eval()
        state_dict = load_hub_workaround(MODEL_URL)
        neuroPred_model.load_state_dict(state_dict)
        neuroPred_model = neuroPred_model.to(device)

        # The first output holds raw classifier outputs, one column per class.
        outputs, att = neuroPred_model(peptide_list, device)
        pred = torch.argmax(outputs, dim=-1).cpu().tolist()
        att = att.cpu().numpy()

        # Drop the attention columns that belong to padding positions.
        out = {
            item[0]: [label, weights[:, :len(item[1])]]
            for item, label, weights in zip(peptide_list, pred, att)
        }
        return out
|
| 20 |
+
|
NeuroPredPLM/utils.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import esm
|
| 3 |
+
from argparse import Namespace
|
| 4 |
+
import pathlib
|
| 5 |
+
import urllib
|
| 6 |
+
|
| 7 |
+
def length_to_mask(length, max_len=None, dtype=None):
    """Build a padding mask from a 1-D tensor of sequence lengths.

    Args:
        length: 1-D tensor of shape ``(B,)`` holding the length of each
            sequence.
        max_len: Width of the mask. If ``None``, the maximum of ``length`` is
            used (``0`` for an empty ``length``). An explicit ``0`` is
            honoured and yields a ``B x 0`` mask.
        dtype: Optional dtype to cast the mask to; the mask is boolean when
            omitted.

    Returns:
        Tensor of shape ``B x max_len`` on ``length.device`` whose entry
        ``[i, j]`` is true exactly when ``j < length[i]``.

    Raises:
        AssertionError: If ``length`` is not 1-dimensional.
    """
    assert len(length.shape) == 1, 'Length shape should be 1 dimensional.'
    if max_len is None:
        # ``max()`` of an empty tensor raises, so an empty batch gets width 0.
        max_len = length.max().item() if length.numel() > 0 else 0

    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    # Broadcast (1, max_len) against (B, 1) to obtain the (B, max_len) mask.
    mask = positions.unsqueeze(0) < length.unsqueeze(1)
    if dtype is not None:
        mask = mask.to(dtype=dtype)
    return mask
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def load_model_and_alphabet_core(args_dict, regression_data=None):
    """Build an ESM model and its alphabet from a saved argument file.

    Only the architecture is constructed here; no weights are loaded into the
    returned model.

    Args:
        args_dict: Path (or file-like object) accepted by ``torch.load`` that
            contains a mapping with an ``"args"`` namespace, including its
            ``arch`` attribute.
        regression_data: Unused; kept so existing callers keep working.

    Returns:
        A tuple ``(model, alphabet)`` with a freshly initialised
        ``esm.ProteinBertModel`` and the matching ``esm.Alphabet``.
    """
    # Load onto the CPU so the file opens on machines without a GPU.
    # NOTE(review): recent PyTorch releases default ``torch.load`` to
    # ``weights_only=True``, which may reject the pickled namespace -
    # confirm against the supported torch versions.
    args_dict = torch.load(args_dict, map_location="cpu")
    saved_args = args_dict["args"]
    alphabet = esm.Alphabet.from_architecture(saved_args.arch)

    def pra(s):
        # Upgrade legacy argument names: keep only what follows "decoder_".
        return "".join(s.split("decoder_")[1:] if "decoder" in s else s)

    model_args = {pra(name): value for name, value in vars(saved_args).items()}
    model = esm.ProteinBertModel(Namespace(**model_args), alphabet)
    return model, alphabet
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def load_hub_workaround(url):
    """Download (or reuse the cached copy of) a checkpoint and load it on CPU.

    Args:
        url: Location of the checkpoint file.

    Returns:
        The object stored in the checkpoint, mapped to the CPU.

    Raises:
        Exception: If the download fails with an HTTP error.
    """
    # A bare ``import urllib`` does not guarantee that these submodules are
    # loaded, so import them explicitly where they are used.
    import urllib.error
    import urllib.parse

    try:
        data = torch.hub.load_state_dict_from_url(url, progress=False, map_location="cpu")
    except RuntimeError:
        # Pytorch version issue - see https://github.com/pytorch/pytorch/issues/43106
        # The file has already been downloaded into the hub cache; load it
        # directly. Derive the name from the URL path only so that a query
        # string or fragment does not end up in the file name.
        fn = pathlib.PurePosixPath(urllib.parse.urlparse(url).path).name
        data = torch.load(
            f"{torch.hub.get_dir()}/checkpoints/{fn}",
            map_location="cpu",
        )
    except urllib.error.HTTPError as e:
        # Chain the original error so the HTTP status stays visible.
        raise Exception(f"Could not load {url}, check your network!") from e
    return data
|
README.md
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
## NeuroPred-PLM: an interpretable and robust model for prediction of neuropeptides by protein language model
|
| 2 |
+
[](https://pypi.org/project/NeuroPredPLM/) [](https://pypi.org/project/NeuroPredPLM/) [](./LICENSE) 
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
### Requirements
|
| 6 |
+
To install requirements:
|
| 7 |
+
|
| 8 |
+
```
|
| 9 |
+
# latest version
|
| 10 |
+
pip install git+https://github.com/ISYSLAB-HUST/NeuroPred-PLM.git
|
| 11 |
+
# stable version
|
| 12 |
+
pip install NeuroPredPLM
|
| 13 |
+
```
|
| 14 |
+
### Usage [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/ISYSLAB-HUST/NeuroPred-PLM/blob/master/notebook/NeuroPred_PLM_test.ipynb)
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
```
|
| 18 |
+
import torch
|
| 19 |
+
from NeuroPredPLM.predict import predict
|
| 20 |
+
data = [
|
| 21 |
+
("peptide_1", "IGLRLPNMLKF"),
|
| 22 |
+
("peptide_2", "QAAQFKVWSASELVD"),
|
| 23 |
+
("peptide_3","LRSPKMMHKSGCFGRRLDRIGSLSGLGCNVLRKY")
|
| 24 |
+
]
|
| 25 |
+
|
| 26 |
+
device = "cuda" if torch.cuda.is_available() else "cpu"
|
| 27 |
+
neuropeptide_pred = predict(data,device)
|
| 28 |
+
# Returns {peptide_id: [label (int: 1 -> neuropeptide, 0 -> non-neuropeptide), attention scores (numpy.ndarray)]}
|
| 29 |
+
```
|
| 30 |
+
### License
|
| 31 |
+
Released under the [MIT license](LICENSE).
|
| 32 |
+
|
| 33 |
+
### Contact
|
| 34 |
+
If you have any questions or comments, or would like to report a bug, please file a GitHub issue or contact me at wanglei94@hust.edu.cn.
|
model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:154841aade40ce25f75ee9028046b361001c90de1cd2c6fd09ead97de076de8a
|
| 3 |
+
size 340609839
|