wnagleiofficial committed on
Commit
e39cbff
·
1 Parent(s): f556603

Add application file

Browse files
NeuroPredPLM/__init__.py ADDED
File without changes
NeuroPredPLM/args.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51bbe01a0f9d64a23fc40c16fae8454188cb5ff6e1b661114490ef7e90718df1
3
+ size 4271
NeuroPredPLM/model.py ADDED
@@ -0,0 +1,55 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ main model
3
+ """
4
+ import torch
5
+ from torch import nn
6
+ import numpy as np
7
+ import torch.nn.functional as F
8
+ from einops import rearrange
9
+ import os
10
+
11
+ from .utils import length_to_mask, load_model_and_alphabet_core
12
+
13
+
14
class EsmModel(nn.Module):
    """
    Peptide classifier built on top of an ESM encoder.

    The layer-12 ESM representations are split into ``head`` chunks along the
    feature axis. Every chunk is linearly projected, run through two 1-D
    convolutions and pooled over the sequence axis with a learned per-head
    attention vector. The pooled heads are concatenated and fed to a small
    feed-forward classifier.
    """

    def __init__(self, hidden_size=64, num_labels=2, projection_size=24, head=12):
        """
        hidden_size: input width of the projection layer (size of one head chunk).
        num_labels: number of logits produced by the classifier.
        projection_size: output width of the projection; the second
            convolution reduces it to ``int(projection_size / 2)``.
        head: number of chunks the ESM feature axis is split into.
        """
        super().__init__()

        # The ESM architecture arguments live in ``args.pt`` next to this module.
        package_dir = os.path.abspath(os.path.dirname(__file__))
        args_path = os.path.join(package_dir, 'args.pt')
        self.esm, self.alphabet = load_model_and_alphabet_core(args_path)

        self.num_labels = num_labels
        self.head = head
        self.hidden_size = hidden_size

        # Attribute names below are part of the checkpoint's state_dict keys.
        half_projection = int(projection_size / 2)
        self.projection = nn.Linear(hidden_size, projection_size)
        self.cov_1 = nn.Conv1d(projection_size, projection_size, kernel_size=3, padding='same')
        self.cov_2 = nn.Conv1d(projection_size, half_projection, kernel_size=1, padding='same')
        # One attention scoring vector per head.
        self.W = nn.Parameter(torch.randn((head, half_projection)))
        self.fcn = nn.Sequential(
            nn.Linear(half_projection * head, half_projection),
            nn.ReLU(),
            nn.Linear(half_projection, num_labels),
        )

    def forward(self, peptide_list, device='cpu'):
        """
        peptide_list: items whose element ``[1]`` is the peptide sequence; the
            list is handed unchanged to the alphabet's batch converter.
        device: device the token tensor and the length mask are moved to.
        return: ``(logits, att)`` where ``logits`` is the classifier output and
            ``att`` holds the softmax attention weights indexed as
            (batch, head, position).
        """
        seq_lengths = [len(item[1]) for item in peptide_list]
        to_batch = self.alphabet.get_batch_converter()
        _, _, tokens = to_batch(peptide_list)
        tokens = tokens.to(device)

        esm_out = self.esm(tokens, repr_layers=[12], return_contacts=False)
        # Drop position 0 of every sequence
        # (presumably the prepended start token -- TODO confirm against the alphabet).
        embeddings = esm_out["representations"][12][:, 1:, :]

        # Fold the heads into the batch axis so all heads share the same layers.
        per_head = rearrange(embeddings, 'b l (h d)-> (b h) l d', h=self.head)
        projected = self.projection(per_head)
        # Conv1d wants the channel axis before the length axis.
        channels_first = rearrange(projected, 'b l d -> b d l')
        conv_out = F.relu(self.cov_1(channels_first))
        conv_out = F.relu(self.cov_2(conv_out))
        features = rearrange(conv_out, '(b h) d l -> b h l d', h=self.head)

        # Score every position per head, then exclude positions past each
        # peptide's length from the softmax.
        scores = torch.einsum('bhld,hd->bhl', features, self.W)
        valid = length_to_mask(torch.tensor(seq_lengths)).to(device)
        scores = scores.masked_fill(valid.unsqueeze(1) == 0, -np.inf)
        att = F.softmax(scores, dim=-1)

        # Attention-weighted sum over positions, heads concatenated on the feature axis.
        weighted = rearrange(features * att.unsqueeze(-1), 'b h l d -> b l (h d)')
        pooled = torch.sum(weighted, dim=1)
        return self.fcn(pooled), att
54
+
55
+
NeuroPredPLM/predict.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .model import EsmModel
2
+ from .utils import load_hub_workaround
3
+ import torch
4
+
5
def predict(peptide_list, model_path, device='cpu'):
    """Score the first peptide of ``peptide_list`` with a freshly loaded model.

    Args:
        peptide_list: non-empty sequence of ``(id, sequence)`` entries, passed
            unchanged to ``EsmModel.forward``.
        model_path: path of the checkpoint given to ``torch.load``; its content
            is loaded as the model's ``state_dict``.
        device: device the model and the inputs are moved to.

    Returns:
        A dict with the softmax probability of class index 1 under the key
        ``'Neuroppetide'`` and of class index 0 under ``"Non-neuropeptide"``.
        Only the first entry of ``peptide_list`` is reported.

    Raises:
        ValueError: if ``peptide_list`` is empty.
    """
    # Fail fast: without this an empty batch only errors after the model has
    # been built and the checkpoint has been read.
    if len(peptide_list) == 0:
        raise ValueError("peptide_list must contain at least one (id, sequence) entry")

    with torch.no_grad():
        neuroPred_model = EsmModel()
        neuroPred_model.eval()
        # Load on CPU first so checkpoints saved from a GPU also open on CPU-only hosts.
        state_dict = torch.load(model_path, map_location="cpu")
        neuroPred_model.load_state_dict(state_dict)
        neuroPred_model = neuroPred_model.to(device)
        # The attention weights are not part of the result, so they are not
        # copied back to the host.
        logits, _ = neuroPred_model(peptide_list, device)
        pred = torch.softmax(logits, dim=-1).cpu().tolist()

    # NOTE(review): 'Neuroppetide' is misspelled, but the key is kept as-is
    # because callers may already depend on it.
    return {'Neuroppetide': pred[0][1], "Non-neuropeptide": pred[0][0]}
18
+
NeuroPredPLM/utils.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import esm
3
+ from argparse import Namespace
4
+ import pathlib
5
+ import urllib
6
+
7
def length_to_mask(length, max_len=None, dtype=None):
    """Turn a 1-D tensor of lengths into a ``B x max_len`` validity mask.

    Entry ``[i, j]`` is true when ``j < length[i]``. When ``max_len`` is
    ``None`` (or otherwise falsy) the largest value in ``length`` is used.
    The mask is boolean unless ``dtype`` is given, in which case it is
    converted to that dtype.
    """
    assert length.dim() == 1, 'Length shape should be 1 dimensional.'

    if not max_len:
        max_len = length.max().item()

    batch_size = length.shape[0]
    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    # Compare every position index against each row's length.
    mask = positions.expand(batch_size, max_len) < length.unsqueeze(1)

    if dtype is None:
        return mask
    return torch.as_tensor(mask, dtype=dtype, device=length.device)
19
+
20
+
21
def load_model_and_alphabet_core(args_dict, regression_data=None):
    """Build an ESM ``ProteinBertModel`` and its alphabet from a saved args file.

    Args:
        args_dict: path (or file-like object) given to ``torch.load``. The
            loaded object must map ``"args"`` to a namespace that has an
            ``arch`` attribute.
        regression_data: unused; kept for signature compatibility.

    Returns:
        ``(model, alphabet)``. No weights are loaded here; the model only
        carries the architecture described by the saved arguments.
    """

    def _strip_decoder_prefix(name):
        # Names mentioning "decoder" keep only what follows "decoder_"
        # (e.g. "decoder_embed_dim" -> "embed_dim"); other names are unchanged.
        if "decoder" in name:
            return "".join(name.split("decoder_")[1:])
        return name

    # NOTE(review): torch.load unpickles the file, so only trusted files should
    # be passed in. Recent PyTorch releases default to ``weights_only=True``,
    # which may reject the pickled namespace -- confirm against the pinned version.
    # map_location keeps the load working on CPU-only hosts.
    checkpoint = torch.load(args_dict, map_location="cpu")
    saved_args = checkpoint["args"]
    alphabet = esm.Alphabet.from_architecture(saved_args.arch)

    # upgrade state dict
    model_args = {
        _strip_decoder_prefix(key): value for key, value in vars(saved_args).items()
    }
    model = esm.ProteinBertModel(Namespace(**model_args), alphabet)
    return model, alphabet
36
+
37
+
38
def load_hub_workaround(url):
    """Download a state dict through ``torch.hub`` and return it, mapped to CPU.

    Args:
        url: URL given to ``torch.hub.load_state_dict_from_url``.

    Returns:
        The object loaded from the downloaded (or already cached) checkpoint.

    Raises:
        Exception: if the download fails with an HTTP error; the original
            ``HTTPError`` is attached as ``__cause__``.
    """
    # A bare ``import urllib`` does not guarantee the ``error`` submodule is
    # loaded, so import it explicitly before it is named in the except clause.
    import urllib.error

    try:
        data = torch.hub.load_state_dict_from_url(url, progress=False, map_location="cpu")
    except RuntimeError:
        # Pytorch version issue - see https://github.com/pytorch/pytorch/issues/43106
        # Fall back to reading the file from the hub cache directory directly.
        fn = pathlib.Path(url).name
        data = torch.load(
            f"{torch.hub.get_dir()}/checkpoints/{fn}",
            map_location="cpu",
        )
    except urllib.error.HTTPError as e:
        # Chain the cause so the HTTP status stays visible in the traceback.
        raise Exception(f"Could not load {url}, check your network!") from e
    return data
README.md CHANGED
@@ -1,5 +1,5 @@
1
  ---
2
- title: NeuroPred PLM
3
  emoji: 😻
4
  colorFrom: yellow
5
  colorTo: pink
 
1
  ---
2
+ title: NeuroPred-PLM
3
  emoji: 😻
4
  colorFrom: yellow
5
  colorTo: pink
app.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from NeuroPredPLM.predict import predict
3
+ import gradio as gr
4
+ from io import StringIO
5
+ from Bio import SeqIO
6
+
7
def classifier(peptide_seq):
    """Classify the peptide given as FASTA text.

    Args:
        peptide_seq: FASTA-formatted text (``>id`` line followed by the sequence).

    Returns:
        The dict produced by ``predict``: class label -> probability for the
        first record of the input.

    Raises:
        ValueError: if the text contains no FASTA record.
    """
    import os

    records = SeqIO.parse(StringIO(peptide_seq), 'fasta')
    # Hand plain strings (not Biopython Seq objects) to the model's tokenizer.
    data = [(record.id, str(record.seq)) for record in records]
    if not data:
        raise ValueError(
            "No FASTA record found in the input; expected a '>id' line followed by a sequence."
        )

    device = "cuda" if torch.cuda.is_available() else "cpu"
    # predict() takes (peptide_list, model_path, device); the checkpoint
    # ``model.pth`` is shipped alongside this script.
    model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), "model.pth")
    return predict(data, model_path, device)
16
+
17
# Gradio UI: a FASTA text box in, a two-class label with probabilities out.
iface = gr.Interface(
    fn=classifier,
    inputs=gr.Textbox(
        label="Input peptide sequence",
        lines=3,
        value=">peptide-1\nIGLRLPNMLKF",
    ),
    # Use the component-style ``gr.Label`` (matching ``gr.Textbox`` above)
    # instead of the legacy ``gr.outputs.Label`` namespace.
    outputs=gr.Label(num_top_classes=2),
    title="NeuroPred-PLM",
)
iface.launch()
model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:154841aade40ce25f75ee9028046b361001c90de1cd2c6fd09ead97de076de8a
3
+ size 340609839
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch
2
+ einops
3
+ numpy
4
+ biopython