| import numpy as np |
| from transformers import AutoTokenizer, AutoModelForSequenceClassification |
| import pandas as pd |
| from transformers import set_seed |
| import torch |
| import torch.nn as nn |
| from collections import OrderedDict |
| import warnings |
| import gradio as gr |
| from tqdm import tqdm |
|
|
# Runtime configuration shared by the model wrapper and the inference function.
model_checkpoint = "esm2_t30_150M_UR50D"  # identifier passed to from_pretrained
device = "cpu"  # inputs and the model are moved to this device

# Suppress every warning category, then fix the transformers seed.
warnings.filterwarnings(action='ignore')
set_seed(4)
|
|
class MyModel(nn.Module):
    """Two-class classifier: pretrained backbone followed by a small MLP head.

    The backbone is loaded from the module-level ``model_checkpoint`` with 320
    output labels. Its logits go through three Linear -> BatchNorm1d -> ReLU
    stages (320 -> 256 -> 128 -> 64) and a final 64 -> 2 linear layer.
    """

    def __init__(self):
        super().__init__()
        # Attribute names become state_dict keys and the Linear layers draw
        # from the RNG when created, so names and creation order are kept
        # stable for checkpoint loading and reproducibility.
        self.bert = AutoModelForSequenceClassification.from_pretrained(
            model_checkpoint,
            num_labels=320,
        )
        self.bn1 = nn.BatchNorm1d(256)
        self.bn2 = nn.BatchNorm1d(128)
        self.bn3 = nn.BatchNorm1d(64)
        self.relu = nn.ReLU()
        self.fc1 = nn.Linear(320, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 64)
        self.output_layer = nn.Linear(64, 2)
        self.dropout = nn.Dropout(0)  # p=0: currently a pass-through

    def forward(self, x):
        """Return softmax class probabilities for a tokenized batch.

        Args:
            x: Mapping providing ``'input_ids'`` and ``'attention_mask'``
                tensors (as produced by the tokenizer with
                ``return_tensors='pt'``).

        Returns:
            Tensor of probabilities, softmax-normalised over dim 1.
        """
        token_ids = x['input_ids'].to(device)
        mask = x['attention_mask'].to(device)

        # NOTE(review): only the backbone call is assumed to run under
        # no_grad (frozen backbone); the source indentation was lost — confirm.
        with torch.no_grad():
            backbone_out = self.bert(input_ids=token_ids, attention_mask=mask)

        hidden = self.dropout(backbone_out["logits"])
        stages = (
            (self.fc1, self.bn1),
            (self.fc2, self.bn2),
            (self.fc3, self.bn3),
        )
        for linear, norm in stages:
            hidden = self.relu(norm(linear(hidden)))

        scores = self.output_layer(hidden)
        return torch.softmax(scores, dim=1)
|
|
def Kmers_funct(seq, num):
    """Return every length-``num`` substring (k-mer) of the given sequences.

    Windows are taken at each start position, left to right, and only
    windows that are exactly ``num`` characters long are kept. K-mers of all
    input sequences are returned in input order (previously the result was
    reset per sequence, so only the last sequence contributed, and an empty
    input raised because the result was never bound).

    Args:
        seq: Iterable of sequence strings.
        num: Window length.

    Returns:
        List of k-mer strings; empty when ``seq`` is empty or no sequence is
        at least ``num`` characters long.
    """
    kmers = []
    for sequence in seq:
        for start in range(len(sequence)):
            window = sequence[start:start + num]
            # Slices near the end are shorter than ``num``; skip them.
            if len(window) == num:
                kmers.append(window)
    return kmers
|
|
# Lazily filled cache so the model and tokenizer are loaded once per process
# instead of on every request.
_ACE_RESOURCES = {}


def _load_ace_resources():
    """Return ``(model, tokenizer)``, loading them from disk on first use."""
    cached = _ACE_RESOURCES.get('pair')
    if cached is None:
        model = MyModel()
        # NOTE: torch.load unpickles the file; only load trusted checkpoints.
        # strict=False silently skips missing/unexpected keys.
        state = torch.load("best_model.pth", map_location=torch.device('cpu'))
        model.load_state_dict(state, strict=False)
        model = model.to(device)
        model.eval()
        tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
        # Single assignment so concurrent callers never see a partial entry.
        cached = (model, tokenizer)
        _ACE_RESOURCES['pair'] = cached
    return cached


def ACE(file):
    """Classify each input sequence and write the results to ``output.csv``.

    Each non-blank line of ``file`` is one sequence. Sequences longer than 30
    characters are replaced by all of their k-mers for k = 2..10; shorter ones
    are classified as-is. Every candidate is labelled ``"ACE"`` or
    ``"non-ACE"`` together with the probability of the predicted class.

    Args:
        file: Text with one sequence per line (the Gradio textbox content).

    Returns:
        Tuple ``(csv_path, label, probability)``. ``label`` and
        ``probability`` describe the last input line; both are the string
        ``"None"`` when that line is longer than 30 characters or when the
        input contains no sequences at all.
    """
    max_len = 30
    csv_path = 'output.csv'
    id2str = {0: "non-ACE", 1: "ACE"}

    test_seqs = [line.strip() for line in (file or "").strip().split('\n') if line.strip()]

    candidates = []
    for test_seq in test_seqs:
        if len(test_seq) > max_len:
            for k in range(2, 11):
                candidates.extend(Kmers_funct([test_seq], k))
        else:
            candidates.append(test_seq)

    if not candidates:
        # Nothing to classify: still produce a (header-only) CSV for the file
        # output instead of failing on unbound locals further down.
        empty = pd.DataFrame({'Seq': [], 'Class': [], 'Probability': []})
        empty.to_csv(csv_path, index=False)
        return csv_path, "None", "None"

    model, tokenizer = _load_ace_resources()

    labels = []
    probabilities = []
    with torch.no_grad():
        for seq in tqdm(candidates):
            encoded = tokenizer(seq, max_length=max_len, padding="max_length",
                                truncation=True, return_tensors='pt')
            # One sequence per call, so row 0 holds its class probabilities.
            probs = model(encoded).cpu().numpy()[0]
            best = int(np.argmax(probs))
            labels.append(id2str[best])
            probabilities.append(float(probs[best]))

    summary_df = pd.DataFrame({
        'Seq': candidates,
        'Class': labels,
        'Probability': probabilities,
    })
    summary_df.to_csv(csv_path, index=False)

    # The text outputs reflect the last input line only; a long sequence has
    # many k-mer predictions, so no single label is reported for it.
    if len(test_seqs[-1]) > max_len:
        return csv_path, "None", "None"
    return csv_path, labels[-1], probabilities[-1]
|
|
# Load the Markdown description shown in the UI. Decode explicitly as UTF-8
# so the result does not depend on the platform's default locale encoding.
with open("ACE.md", "r", encoding="utf-8") as f:
    description = f.read()

# One multi-line textbox in; a downloadable file plus two single-line
# textboxes (class and probability) out, matching ACE's three return values.
iface = gr.Interface(
    fn=ACE,
    title="🏹DeepAngio",
    inputs=gr.Textbox(show_label=False, placeholder="Each row contains one peptide or protein", lines=5),
    outputs=[
        "file",
        gr.Textbox(show_label=False, placeholder="class", lines=1),
        gr.Textbox(show_label=False, placeholder="probability", lines=1),
    ],
    description=description,
)
iface.launch()