wnagleiofficial committed on
Commit
38adcf4
·
1 Parent(s): 1cdc5d8

First model version

Browse files
NeuroPredPLM/__init__.py ADDED
File without changes
NeuroPredPLM/args.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51bbe01a0f9d64a23fc40c16fae8454188cb5ff6e1b661114490ef7e90718df1
3
+ size 4271
NeuroPredPLM/model.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ main model
3
+ """
4
+ import torch
5
+ from torch import nn
6
+ import numpy as np
7
+ import torch.nn.functional as F
8
+ from einops import rearrange
9
+ import os
10
+
11
+ from .utils import length_to_mask, load_model_and_alphabet_core
12
+
13
+
14
class EsmModel(nn.Module):
    """Neuropeptide classifier head on top of a pretrained ESM encoder.

    Pipeline (see ``forward``): ESM per-residue embeddings -> per-head linear
    projection -> two 1-D convolutions -> learned per-head attention pooling
    over sequence length -> small MLP emitting ``num_labels`` logits.
    """

    def __init__(self, hidden_size=64, num_labels=2, projection_size=24, head=12):
        # hidden_size: per-head slice of the ESM embedding (presumably 768/12=64
        #   for a 12-head split — TODO confirm against the loaded args.pt config).
        # projection_size: width of the projected per-head representation.
        # head: number of attention heads used for pooling.
        super().__init__()

        # Load the ESM architecture/alphabet from args.pt shipped next to this file.
        basedir = os.path.abspath(os.path.dirname(__file__))
        self.esm, self.alphabet = load_model_and_alphabet_core(os.path.join(basedir, 'args.pt'))
        self.num_labels = num_labels
        self.head = head
        self.hidden_size = hidden_size
        self.projection = nn.Linear(hidden_size, projection_size)
        # Length-preserving conv stack; cov_2 halves the channel dimension.
        self.cov_1 = nn.Conv1d(projection_size, projection_size, kernel_size=3, padding='same')
        self.cov_2 = nn.Conv1d(projection_size, int(projection_size/2), kernel_size=1, padding='same')
        # self.gating = nn.Linear(projection_size, projection_size)
        # One attention query vector per head, matching cov_2's output width.
        self.W = nn.Parameter(torch.randn((head, int(projection_size/2))))
        # self.mu = nn.Parameter(torch.randn((1, 768)))
        # Classifier over the concatenation of all heads' pooled features.
        self.fcn = nn.Sequential(nn.Linear(int(projection_size/2)*head, int(projection_size/2)),
                                 nn.ReLU(), nn.Linear(int(projection_size/2), num_labels))

    def forward(self, peptide_list, device='cpu'):
        """Return (logits, attention) for a batch of (name, sequence) pairs.

        peptide_list: iterable of 2-tuples; item[1] is the peptide string.
        Returns: logits of shape (batch, num_labels) and per-head attention
        weights of shape (batch, head, seq).
        """
        peptide_length = [len(i[1]) for i in peptide_list]
        # Tokenize with the ESM alphabet; batch_tokens is (batch, tokens).
        batch_converter = self.alphabet.get_batch_converter()
        _, _, batch_tokens = batch_converter(peptide_list)
        batch_tokens = batch_tokens.to(device)
        # Take layer-12 representations and drop the first token
        # (presumably the BOS/CLS token — confirm with alphabet settings).
        protein_dict = self.esm(batch_tokens, repr_layers=[12], return_contacts=False)
        protein_embeddings = protein_dict["representations"][12][:, 1:, :]
        # Split the embedding dim into `head` slices and fold heads into batch.
        protein_embed = rearrange(protein_embeddings, 'b l (h d)-> (b h) l d', h=self.head)
        representations = self.projection(protein_embed)
        representations = rearrange(representations, 'b l d -> b d l')
        representation_cov = F.relu(self.cov_1(representations))
        representation_cov = F.relu(self.cov_2(representation_cov))
        representations = rearrange(representation_cov, '(b h) d l -> b h l d', h=self.head)
        # Per-head attention scores: dot each position's feature with that head's query.
        att = torch.einsum('bhld,hd->bhl', representations, self.W)
        # NOTE(review): mask length comes from the raw peptide lengths, while the
        # embeddings above include tokenizer-appended tokens (e.g. EOS) after the
        # BOS strip — verify the two sequence lengths actually line up.
        mask = length_to_mask(torch.tensor(peptide_length)).to(device)
        # -inf on padded positions so softmax assigns them zero weight.
        att = att.masked_fill(mask.unsqueeze(1)==0, -np.inf)
        att= F.softmax(att, dim=-1)
        # print(att)
        # Attention-weighted sum over positions, heads concatenated back together.
        representations = rearrange(representations * att.unsqueeze(-1), 'b h l d -> b l (h d)')
        representations = torch.sum(representations, dim=1)
        return self.fcn(representations), att
NeuroPredPLM/predict.py ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import EsmModel
2
+ from .utils import load_hub_workaround
3
+ import torch
4
+
5
+ MODEL_URL = "https://zenodo.org/record/7042286/files/model.pth"
6
+
7
def predict(peptide_list, device='cpu'):
    """Classify peptides as neuropeptide / non-neuropeptide.

    Downloads the released weights, runs NeuroPred-PLM under no_grad, and
    returns ``{peptide_id: [label, attention]}`` where label is 1 for
    neuropeptide and 0 otherwise, and attention is a (head, seq) array
    trimmed to the peptide's true length.
    """
    with torch.no_grad():
        model = EsmModel()
        model.eval()
        # Fetch the published checkpoint (cached by torch.hub after first use).
        weights = load_hub_workaround(MODEL_URL)
        # state_dict = torch.load("/mnt/d/protein-net/Neuropep-ESM/model.pth", map_location="cpu")
        model.load_state_dict(weights)
        model = model.to(device)
        logits, attention = model(peptide_list, device)
        labels = torch.argmax(logits, dim=-1).cpu().tolist()
        attention = attention.cpu().numpy()
        results = {}
        for (name, sequence), label, att_map in zip(peptide_list, labels, attention):
            # Trim attention to the real sequence length (drops padding columns).
            results[name] = [label, att_map[:, :len(sequence)]]
        return results
NeuroPredPLM/utils.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
1
+ import pathlib
2
+ import urllib
3
+ import urllib.error
4
+ from argparse import Namespace
5
+
6
+ import torch
7
+
8
+ import esm
7
+ def length_to_mask(length, max_len=None, dtype=None):
8
+ """length: B.
9
+ return B x max_len.
10
+ If max_len is None, then max of length will be used.
11
+ """
12
+ assert len(length.shape) == 1, 'Length shape should be 1 dimensional.'
13
+ max_len = max_len or length.max().item()
14
+ mask = torch.arange(max_len, device=length.device,
15
+ dtype=length.dtype).expand(len(length), max_len) < length.unsqueeze(1)
16
+ if dtype is not None:
17
+ mask = torch.as_tensor(mask, dtype=dtype, device=length.device)
18
+ return mask
19
+
20
+
21
def load_model_and_alphabet_core(args_dict, regression_data=None):
    """Build an untrained ESM model and its alphabet from a saved args checkpoint.

    args_dict: path to a torch-saved dict whose "args" entry is the training
        Namespace (despite the name, callers pass a file path — see EsmModel).
    regression_data: unused; kept for signature compatibility with fair-esm's
        loader of the same name.
    Returns (model, alphabet).
    """
    # Load into a new name instead of shadowing the (misnamed) path parameter.
    checkpoint = torch.load(args_dict)
    alphabet = esm.Alphabet.from_architecture(checkpoint["args"].arch)

    # Upgrade old arg names: strip the historical "decoder_" prefix.
    # (The unused "decoder."-prefix state-dict renamer from fair-esm was removed;
    # no state dict is processed here.)
    pra = lambda s: "".join(s.split("decoder_")[1:] if "decoder" in s else s)
    model_args = {pra(name): value for name, value in vars(checkpoint["args"]).items()}
    model_type = esm.ProteinBertModel

    model = model_type(
        Namespace(**model_args),
        alphabet,
    )
    return model, alphabet
38
def load_hub_workaround(url):
    """Download a checkpoint state dict via torch.hub, with fallbacks.

    url: direct URL to a torch-saved state dict.
    Returns the loaded state dict (mapped to CPU).
    Raises Exception (chained to the HTTPError) when the download fails.
    """
    try:
        data = torch.hub.load_state_dict_from_url(url, progress=False, map_location="cpu")
    except RuntimeError:
        # Pytorch version issue - see https://github.com/pytorch/pytorch/issues/43106
        # Assume the file was downloaded before the zip-load failure and read the
        # cached copy directly.
        fn = pathlib.Path(url).name
        data = torch.load(
            f"{torch.hub.get_dir()}/checkpoints/{fn}",
            map_location="cpu",
        )
    except urllib.error.HTTPError as e:
        # Chain the original HTTPError so the status code isn't lost.
        raise Exception(f"Could not load {url}, check your network!") from e
    return data
README.md ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## NeuroPred-PLM: an interpretable and robust model for prediction of neuropeptides by protein language model
2
+ [![PyPI - Version](https://img.shields.io/pypi/v/NeuroPredPLM.svg?style=flat)](https://pypi.org/project/NeuroPredPLM/) [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/NeuroPredPLM.svg)](https://pypi.org/project/NeuroPredPLM/) [![GitHub - LICENSE](https://img.shields.io/github/license/isyslab-hust/NeuroPred-PLM.svg?style=flat)](./LICENSE) ![PyPI - Downloads](https://img.shields.io/pypi/dm/NeuroPredPLM)
3
+
4
+
5
+ ### Requirements
6
+ To install requirements:
7
+
8
+ ```
9
+ # latest version
10
+ pip install git+https://github.com/ISYSLAB-HUST/NeuroPred-PLM.git
11
+ # stable version
12
+ pip install NeuroPredPLM
13
+ ```
14
+ ### Usage [<img src="https://colab.research.google.com/assets/colab-badge.svg">](https://colab.research.google.com/github/ISYSLAB-HUST/NeuroPred-PLM/blob/master/notebook/NeuroPred_PLM_test.ipynb)
15
+
16
+
17
+ ```
18
+ import torch
19
+ from NeuroPredPLM.predict import predict
20
+ data = [
21
+ ("peptide_1", "IGLRLPNMLKF"),
22
+ ("peptide_2", "QAAQFKVWSASELVD"),
23
+ ("peptide_3","LRSPKMMHKSGCFGRRLDRIGSLSGLGCNVLRKY")
24
+ ]
25
+
26
+ device = "cuda" if torch.cuda.is_available() else "cpu"
27
+ neuropeptide_pred = predict(data, device)
28
+ # {peptide_id: [Type: int (1 -> neuropeptide, 0 -> non-neuropeptide), attention score: np.ndarray]}
29
+ ```
30
+ ### License
31
+ Released under the [MIT license](LICENSE).
32
+
33
+ ### Contact
34
+ If you have any questions, comments, or would like to report a bug, please file a Github issue or contact me at wanglei94@hust.edu.cn.
model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:154841aade40ce25f75ee9028046b361001c90de1cd2c6fd09ead97de076de8a
3
+ size 340609839