Spaces:
Runtime error
Runtime error
wnagleiofficial
committed on
Commit
·
e39cbff
1
Parent(s):
f556603
Add application file
Browse files- NeuroPredPLM/__init__.py +0 -0
- NeuroPredPLM/args.pt +3 -0
- NeuroPredPLM/model.py +55 -0
- NeuroPredPLM/predict.py +18 -0
- NeuroPredPLM/utils.py +50 -0
- README.md +1 -1
- app.py +22 -0
- model.pth +3 -0
- requirements.txt +4 -0
NeuroPredPLM/__init__.py
ADDED
|
File without changes
|
NeuroPredPLM/args.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51bbe01a0f9d64a23fc40c16fae8454188cb5ff6e1b661114490ef7e90718df1
|
| 3 |
+
size 4271
|
NeuroPredPLM/model.py
ADDED
|
@@ -0,0 +1,55 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
main model
|
| 3 |
+
"""
|
| 4 |
+
import torch
|
| 5 |
+
from torch import nn
|
| 6 |
+
import numpy as np
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
from einops import rearrange
|
| 9 |
+
import os
|
| 10 |
+
|
| 11 |
+
from .utils import length_to_mask, load_model_and_alphabet_core
|
| 12 |
+
|
| 13 |
+
|
| 14 |
+
class EsmModel(nn.Module):
    """Neuropeptide classifier built on a pretrained ESM protein language model.

    Per-residue ESM embeddings are split into `head` chunks along the feature
    dimension, projected and convolved per chunk, pooled over residues with a
    learned per-head attention, and finally classified by a small MLP.
    """

    def __init__(self, hidden_size=64, num_labels=2, projection_size=24, head=12):
        # hidden_size: feature width of one per-head slice of the ESM embedding.
        #   forward() rearranges '(h d)' with h=head, so this assumes the ESM
        #   embedding dim equals head * hidden_size (12 * 64 = 768)
        #   — TODO confirm against args.pt.
        # num_labels: number of output classes (2: neuropeptide / not).
        # projection_size: channel width of the projection + conv stack.
        # head: number of attention heads used for residue pooling.
        super().__init__()

        # The ESM architecture args ship next to this file in args.pt; the
        # returned model has random weights — callers load a state dict later.
        basedir = os.path.abspath(os.path.dirname(__file__))
        self.esm, self.alphabet = load_model_and_alphabet_core(os.path.join(basedir, 'args.pt'))
        self.num_labels = num_labels
        self.head = head
        self.hidden_size = hidden_size
        # Project each per-head slice (hidden_size) down to projection_size.
        self.projection = nn.Linear(hidden_size, projection_size)
        self.cov_1 = nn.Conv1d(projection_size, projection_size, kernel_size=3, padding='same')
        self.cov_2 = nn.Conv1d(projection_size, int(projection_size/2), kernel_size=1, padding='same')
        # self.gating = nn.Linear(projection_size, projection_size)
        # One attention vector of width projection_size/2 per head.
        self.W = nn.Parameter(torch.randn((head, int(projection_size/2))))
        # self.mu = nn.Parameter(torch.randn((1, 768)))
        # Classifier over the concatenation of all pooled head features.
        self.fcn = nn.Sequential(nn.Linear(int(projection_size/2)*head, int(projection_size/2)),
                                 nn.ReLU(), nn.Linear(int(projection_size/2), num_labels))

    def forward(self, peptide_list, device='cpu'):
        """Score a batch of peptides.

        Args:
            peptide_list: list of (name, sequence) pairs, the format the ESM
                batch converter expects.
            device: torch device string the tokens and mask are moved to.

        Returns:
            (logits, att): class logits of shape (batch, num_labels) and the
            per-head residue attention weights of shape (batch, head, length).
        """
        peptide_length = [len(i[1]) for i in peptide_list]
        batch_converter = self.alphabet.get_batch_converter()
        _, _, batch_tokens = batch_converter(peptide_list)
        batch_tokens = batch_tokens.to(device)
        # Take representations from transformer layer 12 only.
        protein_dict = self.esm(batch_tokens, repr_layers=[12], return_contacts=False)
        # Drop the first token so positions align with residues
        # (presumably the ESM-prepended BOS/CLS token — verify).
        protein_embeddings = protein_dict["representations"][12][:, 1:, :]
        # Split the embedding dim into `head` chunks, folded into the batch dim.
        protein_embed = rearrange(protein_embeddings, 'b l (h d)-> (b h) l d', h=self.head)
        representations = self.projection(protein_embed)
        # Conv1d expects (batch, channels, length).
        representations = rearrange(representations, 'b l d -> b d l')
        representation_cov = F.relu(self.cov_1(representations))
        representation_cov = F.relu(self.cov_2(representation_cov))
        representations = rearrange(representation_cov, '(b h) d l -> b h l d', h=self.head)
        # Per-head scalar attention score at every residue position.
        att = torch.einsum('bhld,hd->bhl', representations, self.W)
        # Mask padding positions so softmax gives them zero weight.
        mask = length_to_mask(torch.tensor(peptide_length)).to(device)
        att = att.masked_fill(mask.unsqueeze(1)==0, -np.inf)
        att = F.softmax(att, dim=-1)
        # print(att)
        # Attention-weighted features, heads re-concatenated, summed over residues.
        representations = rearrange(representations * att.unsqueeze(-1), 'b h l d -> b l (h d)')
        representations = torch.sum(representations, dim=1)
        return self.fcn(representations), att
|
NeuroPredPLM/predict.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .model import EsmModel
|
| 2 |
+
from .utils import load_hub_workaround
|
| 3 |
+
import torch
|
| 4 |
+
|
| 5 |
+
def predict(peptide_list, model_path, device='cpu'):
    """Score peptides with a pretrained NeuroPred-PLM model.

    Args:
        peptide_list: list of (name, sequence) pairs, as expected by the
            ESM batch converter.
        model_path: path to the saved EsmModel state dict.
        device: torch device string, e.g. 'cpu' or 'cuda'.

    Returns:
        dict mapping class names to softmax probabilities for the FIRST
        peptide in the batch. NOTE(review): attention weights and any
        peptides beyond the first are discarded — confirm this
        single-sequence contract with callers.
    """
    with torch.no_grad():
        neuroPred_model = EsmModel()
        neuroPred_model.eval()
        # state_dict = load_hub_workaround(MODEL_URL)
        state_dict = torch.load(model_path, map_location="cpu")
        neuroPred_model.load_state_dict(state_dict)
        neuroPred_model = neuroPred_model.to(device)
        prob, att = neuroPred_model(peptide_list, device)
        pred = torch.softmax(prob, dim=-1).cpu().tolist()
        # Fix: removed dead `att = att.cpu().numpy()` (result was never used)
        # and corrected the user-facing label typo "Neuroppetide".
        out = {'Neuropeptide': pred[0][1], "Non-neuropeptide": pred[0][0]}
    return out
|
NeuroPredPLM/utils.py
ADDED
|
@@ -0,0 +1,50 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import esm
|
| 3 |
+
from argparse import Namespace
|
| 4 |
+
import pathlib
|
| 5 |
+
import urllib
|
| 6 |
+
|
| 7 |
+
def length_to_mask(length, max_len=None, dtype=None):
    """Turn a 1-D tensor of sequence lengths into a padding mask.

    Args:
        length: 1-D tensor of per-sequence lengths (shape B).
        max_len: width of the mask; falls back to length.max() when falsy.
        dtype: optional dtype to cast the boolean mask to.

    Returns:
        B x max_len tensor; True/1 at valid positions, False/0 at padding.
    """
    assert len(length.shape) == 1, 'Length shape should be 1 dimensional.'
    if not max_len:
        max_len = length.max().item()
    # Broadcast position indices (1, max_len) against lengths (B, 1).
    positions = torch.arange(max_len, device=length.device, dtype=length.dtype)
    mask = positions.unsqueeze(0) < length.unsqueeze(1)
    if dtype is not None:
        mask = mask.to(dtype=dtype)
    return mask
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def load_model_and_alphabet_core(args_dict, regression_data=None):
    """Build an un-initialised ESM ProteinBertModel plus its alphabet.

    Args:
        args_dict: path to a torch-saved checkpoint whose "args" entry holds
            the original ESM command-line Namespace (architecture config).
        regression_data: unused; kept for signature compatibility with the
            upstream esm loader this function is adapted from.

    Returns:
        (model, alphabet) tuple. The model's weights are randomly
        initialised — the caller is expected to load a state dict.
    """
    checkpoint = torch.load(args_dict)
    alphabet = esm.Alphabet.from_architecture(checkpoint["args"].arch)

    # Upgrade legacy state: strip a leading "decoder_" prefix from arg names
    # so they match what ProteinBertModel expects.
    # Fix: removed the unused `prs` lambda and the pointless `model_type`
    # alias from the original.
    pra = lambda s: "".join(s.split("decoder_")[1:] if "decoder" in s else s)
    model_args = {pra(name): value for name, value in vars(checkpoint["args"]).items()}

    model = esm.ProteinBertModel(
        Namespace(**model_args),
        alphabet,
    )
    return model, alphabet
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def load_hub_workaround(url):
    """Fetch a state dict from *url*, tolerating a known torch.hub quirk.

    Falls back to loading the already-downloaded checkpoint from the hub
    cache directory when torch.hub raises RuntimeError; network failures
    are re-raised with a friendlier message.
    """
    try:
        return torch.hub.load_state_dict_from_url(url, progress=False, map_location="cpu")
    except RuntimeError:
        # Pytorch version issue - see https://github.com/pytorch/pytorch/issues/43106
        checkpoint_name = pathlib.Path(url).name
        return torch.load(
            f"{torch.hub.get_dir()}/checkpoints/{checkpoint_name}",
            map_location="cpu",
        )
    except urllib.error.HTTPError:
        raise Exception(f"Could not load {url}, check your network!")
README.md
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
---
|
| 2 |
-
title: NeuroPred
|
| 3 |
emoji: 😻
|
| 4 |
colorFrom: yellow
|
| 5 |
colorTo: pink
|
|
|
|
| 1 |
---
|
| 2 |
+
title: NeuroPred-PLM
|
| 3 |
emoji: 😻
|
| 4 |
colorFrom: yellow
|
| 5 |
colorTo: pink
|
app.py
ADDED
|
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
from NeuroPredPLM.predict import predict
|
| 3 |
+
import gradio as gr
|
| 4 |
+
from io import StringIO
|
| 5 |
+
from Bio import SeqIO
|
| 6 |
+
|
| 7 |
+
def classifier(peptide_seq):
    """Run NeuroPred-PLM on FASTA-formatted peptide sequence text.

    Args:
        peptide_seq: FASTA text from the Gradio textbox (one or more records).

    Returns:
        dict of {class name: probability} for the first record, as produced
        by NeuroPredPLM.predict.predict — suitable for a gr.Label output.
    """
    handle = StringIO(peptide_seq)
    # str() the Seq object: downstream tokenisation expects plain strings.
    data = [(record.id, str(record.seq)) for record in SeqIO.parse(handle, 'fasta')]
    device = "cuda" if torch.cuda.is_available() else "cpu"
    # BUG FIX: predict() has signature (peptide_list, model_path, device);
    # the original call predict(data, device) passed the device string as
    # the model path, so torch.load("cpu") crashed at runtime. Pass the
    # repo's checkpoint path explicitly.
    neuropeptide_pred = predict(data, "model.pth", device)
    return neuropeptide_pred
    # {peptide_id:[Type:int(1->neuropeptide,0->non-neuropeptide), attention score:nd.array]}
|
| 16 |
+
|
| 17 |
+
# Gradio UI: one FASTA textbox in, a two-class probability label out.
# Fix: gr.outputs.Label was deprecated in Gradio 3.x and removed in 4.x;
# use the top-level gr.Label component instead.
iface = gr.Interface(
    fn=classifier,
    inputs=gr.Textbox(
        label="Input peptide sequence",
        lines=3,
        value=">peptide-1\nIGLRLPNMLKF",
    ),
    outputs=gr.Label(num_top_classes=2),
    title="NeuroPred-PLM",
)
iface.launch()
|
model.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:154841aade40ce25f75ee9028046b361001c90de1cd2c6fd09ead97de076de8a
|
| 3 |
+
size 340609839
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
einops
numpy
biopython
fair-esm  # provides the `esm` module imported by NeuroPredPLM/utils.py
|