Spaces:

xiaoleon
/

DeepPD-hf

Sleeping

App Files Files Community

xiaoleon committed on Jan 6, 2024

Commit

46b9840

1 Parent(s): 8a12c65

initial submission

Browse files

Files changed (28) hide show

DeepPD/BERT/config.json +1 -0
DeepPD/BERT/pytorch_model.bin +3 -0
DeepPD/BERT/vocab.txt +0 -0
DeepPD/ESM2/config.json +30 -0
DeepPD/ESM2/esm2_t12_35M_UR50D-contact-regression.pt +3 -0
DeepPD/ESM2/esm2_t12_35M_UR50D.pt +3 -0
DeepPD/ESM2/model_index.json +33 -0
DeepPD/ESM2/special_tokens_map.json +7 -0
DeepPD/ESM2/tokenizer_config.json +4 -0
DeepPD/ESM2/vocab.txt +33 -0
DeepPD/__pycache__/config.cpython-38.pyc +0 -0
DeepPD/__pycache__/data_helper.cpython-38.pyc +0 -0
DeepPD/__pycache__/model.cpython-38.pyc +0 -0
DeepPD/__pycache__/predictor.cpython-38.pyc +0 -0
DeepPD/__pycache__/utils.cpython-38.pyc +0 -0
DeepPD/__pycache__/utils_etfc.cpython-38.pyc +0 -0
DeepPD/config.py +37 -0
DeepPD/data_helper.py +195 -0
DeepPD/model.py +226 -0
DeepPD/predictor.py +26 -0
DeepPD/utils.py +71 -0
DeepPD/utils_etfc.py +367 -0
app.ipynb +205 -0
app.py +83 -0
homo_test.fa +12 -0
requirements.txt +7 -0
weight-Homo/4.pth +3 -0
weight-Mus/4.pth +3 -0

DeepPD/BERT/config.json ADDED Viewed

	@@ -0,0 +1 @@


1	+ {"hidden_size": 128, "hidden_act": "gelu", "initializer_range": 0.02, "vocab_size": 30522, "hidden_dropout_prob": 0.1, "num_attention_heads": 2, "type_vocab_size": 2, "max_position_embeddings": 512, "num_hidden_layers": 2, "intermediate_size": 512, "attention_probs_dropout_prob": 0.1}

DeepPD/BERT/pytorch_model.bin ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1e28abb3688c8927a0dc41d37b6b9d6e30c6c7419e5311d55ce30ed55843da91
+size 17755352

DeepPD/BERT/vocab.txt ADDED Viewed

The diff for this file is too large to render. See raw diff

DeepPD/ESM2/config.json ADDED Viewed

	@@ -0,0 +1,30 @@

+{
+  "_name_or_path": "/tmp/facebook/esm2_t12_35M_UR50D",
+  "architectures": [
+    "EsmForMaskedLM"
+  ],
+  "attention_probs_dropout_prob": 0.0,
+  "classifier_dropout": null,
+  "emb_layer_norm_before": false,
+  "esmfold_config": null,
+  "hidden_act": "gelu",
+  "hidden_dropout_prob": 0.0,
+  "hidden_size": 480,
+  "initializer_range": 0.02,
+  "intermediate_size": 1920,
+  "is_folding_model": false,
+  "layer_norm_eps": 1e-05,
+  "mask_token_id": 32,
+  "max_position_embeddings": 1026,
+  "model_type": "esm",
+  "num_attention_heads": 20,
+  "num_hidden_layers": 12,
+  "pad_token_id": 1,
+  "position_embedding_type": "rotary",
+  "token_dropout": true,
+  "torch_dtype": "float32",
+  "transformers_version": "4.25.0.dev0",
+  "use_cache": true,
+  "vocab_list": null,
+  "vocab_size": 33
+}

DeepPD/ESM2/esm2_t12_35M_UR50D-contact-regression.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:16641e05d830d0ce863dd152dbb8c2f3ddfa3c3ec2a66080152c8abad01d8585
+size 1959

DeepPD/ESM2/esm2_t12_35M_UR50D.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:7f21e80e61d16a71735163ef555d3009afb0c98da74c48e29df08606973cc55e
+size 134095705

DeepPD/ESM2/model_index.json ADDED Viewed

	@@ -0,0 +1,33 @@

+{
+  "_class_name": "StableDiffusionPipeline",
+  "_diffusers_version": "0.8.0",
+  "feature_extractor": [
+    "transformers",
+    "CLIPFeatureExtractor"
+  ],
+  "safety_checker": [
+    null,
+    null
+  ],
+  "scheduler": [
+    "diffusers",
+    "DDIMScheduler"
+  ],
+  "text_encoder": [
+    "transformers",
+    "CLIPTextModel"
+  ],
+  "tokenizer": [
+    "transformers",
+    "CLIPTokenizer"
+  ],
+  "unet": [
+    "diffusers",
+    "UNet2DConditionModel"
+  ],
+  "vae": [
+    "diffusers",
+    "AutoencoderKL"
+  ],
+  "requires_safety_checker": false
+}

DeepPD/ESM2/special_tokens_map.json ADDED Viewed

	@@ -0,0 +1,7 @@

+{
+  "cls_token": "<cls>",
+  "eos_token": "<eos>",
+  "mask_token": "<mask>",
+  "pad_token": "<pad>",
+  "unk_token": "<unk>"
+}

DeepPD/ESM2/tokenizer_config.json ADDED Viewed

	@@ -0,0 +1,4 @@

+{
+  "model_max_length": 1000000000000000019884624838656,
+  "tokenizer_class": "EsmTokenizer"
+}

DeepPD/ESM2/vocab.txt ADDED Viewed

	@@ -0,0 +1,33 @@

+<cls>
+<pad>
+<eos>
+<unk>
+L
+A
+G
+V
+S
+E
+R
+T
+I
+D
+P
+K
+Q
+N
+F
+Y
+M
+H
+W
+C
+X
+B
+U
+Z
+O
+.
+-
+<null_1>
+<mask>

DeepPD/__pycache__/config.cpython-38.pyc ADDED Viewed

Binary file (1.03 kB). View file

DeepPD/__pycache__/data_helper.cpython-38.pyc ADDED Viewed

Binary file (6.37 kB). View file

DeepPD/__pycache__/model.cpython-38.pyc ADDED Viewed

Binary file (5.86 kB). View file

DeepPD/__pycache__/predictor.cpython-38.pyc ADDED Viewed

Binary file (1.17 kB). View file

DeepPD/__pycache__/utils.cpython-38.pyc ADDED Viewed

Binary file (2.28 kB). View file

DeepPD/__pycache__/utils_etfc.cpython-38.pyc ADDED Viewed

Binary file (10.7 kB). View file

DeepPD/config.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import os
+import torch
class ArgsConfig:
    """Container for the hyper-parameters and runtime settings of DeepPD.

    Every option is a plain instance attribute assigned in ``__init__``;
    nothing is read from the command line or the environment.
    """

    def __init__(self) -> None:
        # ---- optimisation / training schedule ----
        self.batch_size = 192
        self.embedding_size = 480
        self.epochs = 50
        self.kflod = 5  # spelling kept: this is the public attribute name
        self.max_len = 40
        self.lr = 1.5e-3
        self.weight_decay = 0
        self.is_autocast = False
        self.info_bottleneck = False
        self.dropout = 0.6
        self.IB_beta = 1e-3

        # ---- experiment bookkeeping ----
        self.model_name = 'DeepPD_C'
        self.exp_nums = 0.0
        self.aa_dict = 'esm'  # one of 'protbert' / 'esm' / None
        self.info = ""  # free-text note about the current training run

        # Data, log and checkpoint locations are disabled in this revision:
        # self.data_c_dir = './data/GPMDB_Homo_sapiens_20190115/sorted_GPMDB_Homo_0.025_0.9.csv'
        # self.data_c1_dir = './data/GPMDB_Homo_sapiens_20190115/sorted_GPMDB_Homo_0.025.csv'
        # self.data_homo_dir = './data/PepFormer/Homo_0.9.csv'
        # self.data_mus_dir = './data/PepFormer/Mus_0.9.csv'
        # self.log_dir = './result/logs'
        # self.save_dir = './result/model_para'
        # self.tensorboard_log_dir = './tensorboard'
        # self.save_para_dir = os.path.join(self.save_dir,self.model_name)

        # ---- ESM checkpoint ----
        self.ems_path = './DeepPD/ESM2/esm2_t12_35M_UR50D.pt'  # attribute name kept as spelled
        self.esm_layer_idx = 12

        # ---- misc ----
        self.random_seed = 2023
        self.num_classes = 21
        self.split_size = 0.8
        # Use the GPU when one is visible to torch, otherwise the CPU.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DeepPD/data_helper.py ADDED Viewed

	@@ -0,0 +1,195 @@

+import numpy as np
+import torch
+import torch.nn.utils.rnn as rnn_utils
def Data2EqlTensor(lines,max_len):
    """Encode ``(identifier, peptide)`` pairs as a fixed-width index tensor.

    Residues are mapped with the ESM-style table below (``L`` = 4 ... ``C`` = 23).
    The non-standard residues ``X``/``B``/``U``/``Z``/``O`` and any character that
    is missing from the table share the padding index ``1``; lower-case letters
    are upper-cased before the lookup.

    Peptides shorter than ``max_len`` are right-padded with the padding index.
    Peptides of length ``max_len`` or more are reduced to their first and last
    ``int(max_len / 2)`` residues.  Every returned row is exactly ``max_len``
    wide, also when ``max_len`` is odd or when no short peptide is present.

    Args:
        lines: iterable of ``(identifier, peptide_string)`` pairs.
        max_len: width of each row of the returned tensor.

    Returns:
        ``(data, ids)`` where ``data`` is an int64 tensor of shape
        ``(len(lines), max_len)`` and ``ids`` lists the identifiers in input order.
    """
    aa_dict = {'[PAD]': 1, 'L': 4, 'A': 5, 'G': 6, 'V': 7, 'S': 8, 'E': 9, 'R': 10, 'T': 11, 'I': 12, 'D': 13, 'P': 14, 'K': 15, 'Q': 16,
                   'N': 17, 'F': 18, 'Y': 19, 'M': 20, 'H': 21, 'W': 22, 'C': 23, 'X': 1, 'B': 1, 'U': 1, 'Z': 1, 'O': 1}
    pad_value = aa_dict['[PAD]']
    half = int(max_len / 2)

    ids = []
    encoded = []
    long_pep_counter = 0
    for seq_id, pep in lines:
        ids.append(seq_id)
        if len(pep) >= max_len:
            # Keep only the head and the tail of an over-long peptide.
            pep = pep[:half] + pep[len(pep) - half:]
            long_pep_counter += 1
        # Unknown characters fall back to the padding index instead of raising.
        encoded.append([aa_dict.get(aa.upper(), pad_value) for aa in pep])
    print("length>"+str(max_len)+':',long_pep_counter)

    # Pre-fill with padding so that every row has the requested width, then
    # copy each encoded peptide into the start of its row.
    data = torch.full((len(encoded), max_len), pad_value, dtype=torch.long)
    for row, codes in enumerate(encoded):
        if codes:
            data[row, :len(codes)] = torch.tensor(codes, dtype=torch.long)
    return data,ids
def Seqs2EqlTensor(file_path:str,max_len:int,AminoAcid_vocab=None):
    '''Read a ``peptide,label`` text file and encode it as padded index tensors.

    Each non-blank line must contain exactly one comma separating the peptide
    string from an integer label.  Residues are upper-cased and looked up in the
    selected vocabulary; characters that are not in the vocabulary are dropped.
    Peptides longer than ``max_len`` are reduced to their first and last
    ``int(max_len / 2)`` residues before encoding.

    Original author's note: only the 20 standard amino acids and the padding
    number are kept, the non-standard ones are mapped to the padding number;
    with the esm / protbert vocabularies ``nn.Embedding`` needs vocab_size = 25.

    Args:
        file_path: path of the text file to read.
        max_len: maximum number of residues kept per peptide.
        AminoAcid_vocab: ``'esm'``, ``'protbert'`` or anything else for the
            default alphabetical vocabulary.

    Returns:
        ``(data, labels)``: ``data`` is an int64 tensor padded (with the
        vocabulary's ``[PAD]`` index) to the longest encoded peptide in the
        file, ``labels`` is an int64 tensor with one entry per peptide.  An
        input without any record yields a ``(0, 0)`` tensor and empty labels.
    '''
    if AminoAcid_vocab =='esm':
        aa_dict = {'[PAD]': 1, 'L': 4, 'A': 5, 'G': 6, 'V': 7, 'S': 8, 'E': 9, 'R': 10, 'T': 11, 'I': 12, 'D': 13, 'P': 14, 'K': 15, 'Q': 16,
                   'N': 17, 'F': 18, 'Y': 19, 'M': 20, 'H': 21, 'W': 22, 'C': 23, 'X': 1, 'B': 1, 'U': 1, 'Z': 1, 'O': 1}
    elif AminoAcid_vocab == 'protbert':
        aa_dict = {'[PAD]':0,'L': 5, 'A': 6, 'G': 7, 'V': 8, 'E': 9, 'S': 10, 'I': 11, 'K': 12, 'R': 13, 'D': 14, 'T': 15,
               'P': 16, 'N': 17, 'Q': 18, 'F': 19, 'Y': 20, 'M': 21, 'H': 22, 'C': 23, 'W': 24, 'X': 0, 'U': 0, 'B': 0, 'Z': 0, 'O': 0}
    else:
        aa_dict = {'[PAD]':0,'A':1,'C':2,'D':3,'E':4,'F':5,'G':6,'H':7,'I':8,'K':9,'L':10,'M':11,'N':12,'P':13,'Q':14,'R':15,
               'S':16,'T':17,'V':18,'W':19,'Y':20,'U':0,'X':0,'J':0}
    dict_padding_value = aa_dict['[PAD]']
    half = int(max_len / 2)

    with open(file_path, 'r', encoding='utf-8') as inf:
        lines = inf.read().splitlines()

    long_pep_counter=0
    pos_count = 0
    neg_count = 0
    pep_codes=[]
    labels=[]
    for line in lines:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        pep,label = line.split(",")
        label = int(label)
        labels.append(label)
        if label == 1:
            pos_count+=1
        else:
            neg_count+=1
        seq_len = len(pep)
        if seq_len > max_len:
            pep = pep[:half] + pep[seq_len - half:]
            long_pep_counter += 1
        # Same lookup rule for short and truncated peptides: upper-case the
        # residue and silently skip characters outside the vocabulary.
        current_pep = [aa_dict[aa.upper()] for aa in pep if aa.upper() in aa_dict]
        pep_codes.append(torch.tensor(current_pep, dtype=torch.long))
    print("length > {}:{},postive sample:{},negative sample:{}".format(max_len,long_pep_counter,pos_count,neg_count))

    if not pep_codes:
        # pad_sequence cannot handle an empty list.
        return torch.empty((0, 0), dtype=torch.long), torch.tensor(labels, dtype=torch.long)
    data = rnn_utils.pad_sequence(pep_codes,batch_first=True,padding_value=dict_padding_value)
    return data,torch.tensor(labels)
def Numseq2OneHot(numseq):
    """One-hot encode index sequences into 20 channels.

    For every position the channel ``index - 1`` is set to 1, so index 1 lands
    in channel 0 and index 20 in channel 19.

    NOTE(review): index 0 is written to channel ``-1``, i.e. the same channel as
    index 20.  That looks unintended for a padding index, but it is preserved
    here because existing weights may have been trained with it -- confirm.

    Args:
        numseq: iterable of 1-D integer tensors (e.g. a ``[batch, seq_len]`` tensor).

    Returns:
        float32 tensor of shape ``[batch, seq_len, 20]`` living on the CPU.
    """
    encoded = []
    for seq in numseq:
        indices = seq.cpu().numpy()
        one_hot = np.zeros((len(indices), 20), dtype=np.float32)
        # Fancy indexing sets one channel per row; negative channels wrap.
        one_hot[np.arange(len(indices)), indices - 1] = 1
        encoded.append(one_hot)
    return torch.tensor(np.array(encoded))
def index_alignment(batch,condition_num=0,subtraction_num1=4,subtraction_num2=1):
    '''Shift vocabulary indices of another protein language model onto the default range.

    Entries equal to ``condition_num`` are treated as ``[PAD]``; every other
    entry is reduced by ``subtraction_num1``.

    * ``condition_num == 0``: padding entries are returned unchanged
      (``subtraction_num2`` is ignored).
    * ``condition_num == 1``: padding entries are reduced by ``subtraction_num2``.

    Settings given by the original author:
        "esm" model: ``condition_num=1, subtraction_num1=3, subtraction_num2=1``
        "protbert" model: ``condition_num=0, subtraction_num1=4``

    Args:
        batch: 2-D tensor of shape ``[batch_size, seq_len]``.
        condition_num: the ``[PAD]`` index of the source vocabulary (0 or 1).
        subtraction_num1: offset subtracted from non-padding entries.
        subtraction_num2: offset subtracted from padding entries.

    Returns:
        Tensor with the same shape as ``batch``.

    Raises:
        ValueError: if ``condition_num`` is neither 0 nor 1 (the original code
            failed with an unbound local variable in that case).
    '''
    if condition_num not in (0, 1):
        raise ValueError(
            f"condition_num must be 0 or 1, got {condition_num!r}"
        )
    is_pad = batch == condition_num
    shifted = batch - subtraction_num1
    if condition_num == 0:
        # Padding is already 0: keep it, shift everything else.
        return torch.where(is_pad, batch, shifted)
    # Padding has its own offset so that it can be moved to 0 as well.
    return torch.where(is_pad, batch - subtraction_num2, shifted)
# BLOSUM62 substitution scores, one 20-value row per residue.
# Keys are residue indices written as strings; the trailing comment on each row
# names the residue.  Key '0' is an all-zero row used for padding.
# NOTE(review): the keys follow the alphabetical vocabulary (A=1, C=2, ...),
# whereas DeepPD.forward passes ESM indices shifted by index_alignment (L=1,
# A=2, ...).  The two orders differ -- confirm this is intended before changing
# anything, since released weights were trained with the current behaviour.
blosum62 = {
        '1': [4, -1, -2, -2, 0,  -1, -1, 0, -2,  -1, -1, -1, -1, -2, -1, 1,  0,  -3, -2, 0],  # A
        '15': [-1, 5,  0, -2, -3, 1,  0,  -2, 0,  -3, -2, 2,  -1, -3, -2, -1, -1, -3, -2, -3], # R
        '12': [-2, 0,  6,  1,  -3, 0,  0,  0,  1,  -3, -3, 0,  -2, -3, -2, 1,  0,  -4, -2, -3], # N
        '3': [-2, -2, 1,  6,  -3, 0,  2,  -1, -1, -3, -4, -1, -3, -3, -1, 0,  -1, -4, -3, -3], # D
        '2': [0,  -3, -3, -3, 9,  -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1], # C
        '14': [-1, 1,  0,  0,  -3, 5,  2,  -2, 0,  -3, -2, 1,  0,  -3, -1, 0,  -1, -2, -1, -2], # Q
        '4': [-1, 0,  0,  2,  -4, 2,  5,  -2, 0,  -3, -3, 1,  -2, -3, -1, 0,  -1, -3, -2, -2], # E
        '6': [0,  -2, 0,  -1, -3, -2, -2, 6,  -2, -4, -4, -2, -3, -3, -2, 0,  -2, -2, -3, -3], # G
        '7': [-2, 0,  1,  -1, -3, 0,  0,  -2, 8,  -3, -3, -1, -2, -1, -2, -1, -2, -2, 2,  -3], # H
        '8': [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4,  2,  -3, 1,  0,  -3, -2, -1, -3, -1, 3],  # I
        '10': [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2,  4,  -2, 2,  0,  -3, -2, -1, -2, -1, 1],  # L
        '9': [-1, 2,  0,  -1, -3, 1,  1,  -2, -1, -3, -2, 5,  -1, -3, -1, 0,  -1, -3, -2, -2], # K
        '11': [-1, -1, -2, -3, -1, 0,  -2, -3, -2, 1,  2,  -1, 5,  0,  -2, -1, -1, -1, -1, 1],  # M
        '5': [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0,  0,  -3, 0,  6,  -4, -2, -2, 1,  3,  -1], # F
        '13': [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7,  -1, -1, -4, -3, -2], # P
        '16': [1,  -1, 1,  0,  -1, 0,  0,  0,  -1, -2, -2, 0,  -1, -2, -1, 4,  1,  -3, -2, -2], # S
        '17': [0,  -1, 0,  -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1,  5,  -2, -2, 0],  # T
        '19': [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1,  -4, -3, -2, 11, 2,  -3], # W
        '20': [-2, -2, -2, -3, -2, -1, -2, -3, 2,  -1, -1, -2, -1, 3,  -3, -2, -2, 2,  7,  -1], # Y
        '18': [0,  -3, -3, -3, -1, -2, -2, -3, -3, 3,  1,  -2, 1,  -1, -2, -2, 0,  -3, -1, 4],  # V
        '0': [0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],  # -
    }


def get_blosum62(seq):
    """Return the BLOSUM62 rows for one index sequence.

    Args:
        seq: 1-D tensor/array of integer residue indices (anything exposing
            ``tolist()``), or a plain iterable of integers.

    Returns:
        Integer ``numpy`` array of shape ``(len(seq), 20)``.

    Raises:
        ValueError: if an index has no row in ``blosum62``.  The original code
            inserted ``None`` for such indices, which only failed later with an
            unrelated-looking conversion error.
    """
    indices = seq.tolist() if hasattr(seq, "tolist") else list(seq)
    try:
        return np.array([blosum62[str(i)] for i in indices])
    except KeyError as err:
        raise ValueError(
            f"residue index {err.args[0]} has no BLOSUM62 row; expected 0..20"
        ) from err


def seqs2blosum62(sequences):
    """Encode a batch of index sequences with BLOSUM62 rows.

    Args:
        sequences: iterable of equally long index sequences, e.g. a
            ``[batch, seq_len]`` tensor.

    Returns:
        float64 CPU tensor of shape ``[batch, seq_len, 20]``.
    """
    evolution = np.array([get_blosum62(seq) for seq in sequences],dtype=float)
    return torch.from_numpy(evolution)

DeepPD/model.py ADDED Viewed

	@@ -0,0 +1,226 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from DeepPD.utils import CBAMBlock,Res_Net
+from DeepPD.data_helper import Numseq2OneHot
+from transformers import BertModel
# BERT encoder loaded once at import time from the local checkpoint folder and
# shared by every MyModel instance.
bert_wight = BertModel.from_pretrained("./DeepPD/BERT")


class MyModel(nn.Module):
    """Two-branch classifier over index-encoded sequences.

    Branch 1: learned embedding (108) concatenated with a 20-channel one-hot
    encoding -> transformer encoder -> 2-D conv / residual / CBAM stack -> GRU.
    Branch 2: the pretrained BERT encoder -> the same GRU.
    Both branches are flattened, concatenated and classified by ``fc``.

    ``fc`` expects 4200 input features = 2 * (seq_len * 50 + 100), which fixes
    the sequence length at 40 (bidirectional GRU with hidden size 25, 2 layers).

    NOTE(review): the transformer encoder layer is built without
    ``batch_first=True`` while it receives a ``[batch, seq, feat]`` tensor;
    with PyTorch's default layout it would attend across the batch axis --
    confirm against the training setup before touching it.
    """

    def __init__(self):
        super().__init__()
        # Number of channels of the 2-D convolutional stack.
        channels = 64
        vocab_size = 21
        self.hidden_dim = 25
        self.gru_emb = 128
        self.emb_dim = 108
        self.model = bert_wight
        self.gru = nn.GRU(self.gru_emb, self.hidden_dim, num_layers=2,
                          bidirectional=True, dropout=0.1)
        self.embedding = nn.Embedding(vocab_size, self.emb_dim, padding_idx=0)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=128, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=1)
        self.resnet = Res_Net(channels)
        self.cbamBlock = CBAMBlock(channels)
        # 1x1 convolutions that expand to / collapse from `channels` feature maps.
        self.convblock1 = nn.Sequential(
            nn.Conv2d(1, channels, 1),
            nn.BatchNorm2d(channels),
            nn.LeakyReLU(),
        )
        self.convblock2 = nn.Sequential(
            nn.Conv2d(channels, 1, 1),
            nn.BatchNorm2d(1),
            nn.LeakyReLU(),
        )
        self.fc = nn.Sequential(
            nn.Linear(4200, 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(512, 32),
            nn.BatchNorm1d(32),
            nn.LeakyReLU(),
            nn.Linear(32, 2),
        )

    def _gru_features(self, feats):
        """Run ``[batch, seq, 128]`` features through the GRU and flatten the result.

        Returns the per-step outputs followed by the final hidden states, both
        flattened per sample and concatenated along dim 1.
        """
        # The GRU is sequence-first, hence the permutes around the call.
        steps, last_hidden = self.gru(feats.permute(1, 0, 2))
        steps = steps.permute(1, 0, 2)
        last_hidden = last_hidden.permute(1, 0, 2)
        flat_steps = steps.reshape(steps.shape[0], -1)
        flat_hidden = last_hidden.reshape(last_hidden.shape[0], -1)
        return torch.cat([flat_steps, flat_hidden], 1)

    def forward(self, x):
        """Return the two-class logits for a ``[batch, seq_len]`` index tensor."""
        # ---- branch 1: embedding + one-hot -> transformer -> conv stack -> GRU ----
        embedded = self.embedding(x)
        one_hot = Numseq2OneHot(x).type_as(embedded)
        feats = torch.cat([embedded, one_hot], 2)
        feats = self.transformer_encoder(feats)
        feats = self.convblock1(feats.unsqueeze(1))
        # The same residual block (shared weights) is applied twice.
        feats = self.resnet(feats)
        feats = self.resnet(feats)
        feats = self.cbamBlock(feats)
        feats = self.convblock2(feats).squeeze(1)
        conv_branch = self._gru_features(feats)

        # ---- branch 2: pretrained BERT -> GRU ----
        bert_branch = self._gru_features(self.model(x)[0])

        merged = torch.cat([bert_branch, conv_branch], 1)
        return self.fc(merged)
+from DeepPD.utils_etfc import *
+import torch,esm
+import torch.nn as nn
+from DeepPD.data_helper import index_alignment,seqs2blosum62
+import torch.nn.functional as f
# Module-level default device; kept as a public name of this module.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


class DeepPD(nn.Module):
    """Peptide classifier combining ESM embeddings, attention, CNN and GRU features.

    Feature A: learned embedding + positional encoding + self-attention, summed
        with the frozen ESM representation.
    Feature B: BLOSUM62 rows of the aligned indices (20 channels).
    A and B are concatenated, passed through two 1-D convolutions (kernel 3 and
    7) and a bidirectional GRU.
    Feature C: a linear projection of the ESM representation.
    The flattened A/B and C features feed either the plain classifier ``fc`` or
    the information-bottleneck head (``enc_mean`` / ``enc_std`` / ``dec``).

    ``embedding_size`` has to equal the hidden size of the ESM checkpoint
    because both tensors are added element-wise, and inputs must be exactly
    ``seq_len`` long because the classifier input width depends on it.

    Args:
        vocab_size: size of the learned embedding table.
        embedding_size: width of the learned embedding.
        fan_layer_num: stored only; not used by this class.
        num_heads: number of attention heads.
        encoder_layer_num: how many times the attention block is applied.
        seq_len: fixed input length.
        output_size: number of output logits.
        layer_idx: ESM layer whose representation is used.
        esm_path: local path of the ESM checkpoint.
        dropout: dropout probability used throughout the model.
        max_pool: stored only; the pooling layer is disabled.
        Contrastive_Learning: stored as ``self.ctl`` only.
        info_bottleneck: select the information-bottleneck head in ``forward``.
    """

    def __init__(self, vocab_size:int, embedding_size:int, fan_layer_num:int, num_heads:int,encoder_layer_num:int=1,seq_len: int=40,
                 output_size:int=2, layer_idx=None,esm_path=None,dropout:float=0.6, max_pool: int=4,Contrastive_Learning=False,info_bottleneck=False):
        super(DeepPD, self).__init__()
        self.vocab_size = vocab_size
        self.embedding_size = embedding_size
        self.output_size = output_size
        self.seq_len = seq_len
        self.dropout = dropout
        self.dropout_layer = nn.Dropout(self.dropout)
        self.encoder_layer_num = encoder_layer_num
        self.fan_layer_num = fan_layer_num
        self.num_heads = num_heads
        self.max_pool = max_pool
        self.ctl = Contrastive_Learning
        self.info_bottleneck = info_bottleneck
        self.ESMmodel,_ = esm.pretrained.load_model_and_alphabet_local(esm_path)
        self.ESMmodel.eval()
        self.layer_idx = layer_idx

        self.out_chs = 64
        self.kernel_sizes = [3,7]
        # One conv branch per kernel size; the padding keeps the sequence length.
        self.all_conv = nn.ModuleList([
            nn.Sequential(
                nn.Conv1d(self.embedding_size+20,out_channels=self.out_chs,kernel_size=kernel,padding=(kernel-1)//2),
                nn.BatchNorm1d(self.out_chs),
                nn.LeakyReLU()
            )
            for kernel in self.kernel_sizes
        ])
        self.hidden_dim = 64
        self.gru = nn.GRU(self.out_chs*2, self.hidden_dim, num_layers=2, batch_first=True,
                               bidirectional=True,dropout=0.25)
        self.embed = nn.Embedding(self.vocab_size, self.embedding_size)
        self.pos_encoding = PositionalEncoding(num_hiddens=self.embedding_size,dropout=self.dropout)
        self.attention_encode = AttentionEncode(self.dropout, self.embedding_size, self.num_heads,seq_len=self.seq_len,ffn=False)

        # Width of the flattened features: per position the bidirectional GRU
        # yields 2*hidden_dim values and the ESM projection out_chs values.
        # With the defaults this is 40*(64*2+64), the value that used to be
        # hard-coded here.
        shape = int(self.seq_len * (self.hidden_dim * 2 + self.out_chs))
        z_dim = 1024
        self.enc_mean = nn.Linear(shape,z_dim)
        self.enc_std = nn.Linear(shape,z_dim)
        self.dec = nn.Sequential(
                                    nn.Linear(z_dim,128),
                                    nn.BatchNorm1d(128),
                                    nn.LeakyReLU(),
                                    nn.Linear(128,self.output_size)
        )
        self.proj_layer = nn.Linear(self.embedding_size,self.out_chs)
        self.fc = nn.Sequential(
                                    nn.Linear(shape,z_dim),
                                    nn.BatchNorm1d(z_dim),
                                    nn.LeakyReLU(),
                                    nn.Linear(z_dim,128),
                                    nn.BatchNorm1d(128),
                                    nn.LeakyReLU(),
                                    nn.Linear(128,self.output_size)
                                    )

    def CNN1DNet(self,x):
        """Apply every conv branch to ``x`` and concatenate the results on the channel axis."""
        return torch.cat([conv(x) for conv in self.all_conv], dim=1)

    def forward(self, x):
        """Compute logits for a ``[batch, seq_len]`` tensor of ESM token indices.

        Returns:
            ``(logits, enc_mean, enc_std)`` when ``info_bottleneck`` is set,
            otherwise ``(logits, logits, logits)`` so callers can always unpack
            three values.
        """
        # The ESM representation is used as a fixed feature extractor.
        with torch.no_grad():
            results = self.ESMmodel(x, repr_layers=[self.layer_idx], return_contacts=False)
        esm_x = results["representations"][self.layer_idx]
        # Map ESM token ids onto the index range used by the embedding table.
        x = index_alignment(x,condition_num=1,subtraction_num1=3,subtraction_num2=1)

        # ---- feature A ----
        embed_x = self.embed(x)
        encoding_x = self.pos_encoding(embed_x * math.sqrt(self.embedding_size))
        for _ in range(self.encoder_layer_num):
            encoding_x = self.attention_encode(encoding_x)
            encoding_x = encoding_x + embed_x
        featA = encoding_x + esm_x

        # ---- feature B ----
        # Follow the device of the activations instead of the module-level
        # default, so the model also works when placed on another device.
        pssm = seqs2blosum62(x).to(embed_x.device)
        featB = pssm.type_as(embed_x)

        featAB = torch.cat([featA,featB],dim=2)
        cnn_output = self.CNN1DNet(featAB.permute(0, 2, 1))
        out = self.dropout_layer(cnn_output)
        out = out.permute(0,2,1)
        out,_ = self.gru(out)
        out = self.dropout_layer(out)
        final_featAB = out.reshape(x.size(0),-1)

        # ---- feature C ----
        featC = self.proj_layer(esm_x)
        featC = self.dropout_layer(featC)
        featC = featC.reshape(featC.shape[0],-1)

        feat = torch.cat([final_featAB,featC],1)
        final_feat = self.dropout_layer(feat)

        if self.info_bottleneck:
            # Information-bottleneck head: sample around the predicted mean
            # with a softplus-parameterised standard deviation.
            enc_mean, enc_std = self.enc_mean(final_feat), f.softplus(self.enc_std(final_feat)-5)
            eps = torch.randn_like(enc_std)
            IB_out = enc_mean + enc_std*eps
            logits = self.dec(IB_out)
            return logits,enc_mean,enc_std
        # Plain fully connected head.
        logits = self.fc(final_feat)
        return logits,logits,logits

DeepPD/predictor.py ADDED Viewed

	@@ -0,0 +1,26 @@

+from DeepPD.model import MyModel,DeepPD
+import torch
+import torch.nn as nn
+from DeepPD.config import ArgsConfig
args = ArgsConfig()
softmax = nn.Softmax(1)


def predict(seqs,data,model_path,threshold=0.5, device=args.device):
    """Score encoded peptides with a DeepPD checkpoint.

    Args:
        seqs: index tensor accepted by ``DeepPD.forward``.
        data: records aligned with the rows of ``seqs``; items ``[0]`` and
            ``[1]`` of each record are copied into the output.
        model_path: path of the state dict to load.
        threshold: probability above which a peptide is labelled ``'Peptide'``.
        device: device on which the model and the inputs are placed.

    Returns:
        One ``[record[0], record[1], probability, label]`` list per record,
        where ``probability`` is the class-1 softmax score formatted with three
        decimals and ``label`` is ``'Peptide'`` or ``'Non-Peptide'``.
    """
    import warnings

    model = DeepPD(vocab_size=21,embedding_size=args.embedding_size,esm_path=args.ems_path,layer_idx=args.esm_layer_idx,seq_len=args.max_len,dropout=args.dropout,
               fan_layer_num=1,num_heads=8,encoder_layer_num=1,Contrastive_Learning=False,info_bottleneck=args.info_bottleneck)
    state_dict = torch.load(model_path, map_location=device)
    # strict=False tolerates key differences; report them instead of hiding them.
    load_result = model.load_state_dict(state_dict,strict=False)
    if load_result.missing_keys or load_result.unexpected_keys:
        warnings.warn(
            f"checkpoint {model_path!r} does not match the model exactly: "
            f"{len(load_result.missing_keys)} missing and "
            f"{len(load_result.unexpected_keys)} unexpected key(s)"
        )
    # Move once, to the device the caller asked for.
    model.to(device)
    model.eval()

    with torch.no_grad():
        out,_,_ = model(seqs.to(device))
        # Plain Python floats are cheaper to format and compare than 0-dim tensors.
        probs = softmax(out)[:,1].tolist()

    return [
        [record[0], record[1], f"{prob:.3f}", 'Peptide' if prob > threshold else 'Non-Peptide']
        for record, prob in zip(data, probs)
    ]

DeepPD/utils.py ADDED Viewed

	@@ -0,0 +1,71 @@

+import torch
+import torch.nn as nn
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class Res_Net(nn.Module):
    """Residual block of two 3x3 convolutions that keeps shape and channel count.

    ``forward`` only uses ``conv1``, ``bn1`` and ``relu2`` (each conv/bn pair is
    applied twice with shared weights).  ``conv2``, ``conv3``, ``cbamBlock`` and
    ``relu1`` are created but not used there; they stay so that parameter names
    and initialisation order remain as they were.
    """

    def __init__(self,input_cha):
        super().__init__()
        self.conv1 = nn.Conv2d(input_cha, input_cha, 3, padding=1)
        self.conv2 = nn.Conv2d(input_cha, input_cha, 5, padding=2)
        self.conv3 = nn.Conv2d(input_cha, input_cha, 7, padding=3)
        self.cbamBlock = CBAMBlock(input_cha)
        self.bn1 = nn.BatchNorm2d(input_cha)
        self.relu1 = nn.ReLU()
        self.relu2 = nn.LeakyReLU()

    def forward(self,x):
        """Return ``LeakyReLU(bn(conv(LeakyReLU(bn(conv(x))))) + x)``."""
        shortcut = x
        hidden = self.relu2(self.bn1(self.conv1(x)))
        hidden = self.bn1(self.conv1(hidden))
        hidden += shortcut
        return self.relu2(hidden)
class CBAMBlock(nn.Module):
    """Channel gate followed by a spatial gate over a 4-D feature map.

    Channel gate: global average- and max-pooled descriptors pass through the
    same two-layer bottleneck, are summed and squashed with a sigmoid.
    Spatial gate: channel-wise mean and max maps are stacked, convolved with a
    7x7 kernel and squashed with a sigmoid.

    Args:
        channel: number of channels of the input.
        reduction: bottleneck ratio of the channel gate.
    """

    def __init__(self, channel, reduction=16):
        super().__init__()
        bottleneck = int(channel // reduction)
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.max_pool = nn.AdaptiveMaxPool2d(1)
        self.channel_excitation = nn.Sequential(
            nn.Linear(channel, bottleneck, bias=False),
            nn.ReLU(inplace=True),
            nn.Linear(bottleneck, channel, bias=False),
        )
        self.sigmoid = nn.Sigmoid()
        self.spatial_excitation = nn.Sequential(
            nn.Conv2d(2, 1, kernel_size=7, stride=1, padding=3, bias=False),
        )

    def forward(self, x):
        """Return ``x`` rescaled by the channel gate and then by the spatial gate."""
        batch, channels, _, _ = x.size()

        # ---- channel attention ----
        avg_score = self.channel_excitation(self.avg_pool(x).view(batch, channels))
        avg_score = avg_score.view(batch, channels, 1, 1)
        max_score = self.channel_excitation(self.max_pool(x).view(batch, channels))
        max_score = max_score.view(batch, channels, 1, 1)
        channel_gate = self.sigmoid(avg_score + max_score)
        channel_scaled = torch.mul(x, channel_gate)

        # ---- spatial attention ----
        mean_map = torch.mean(channel_scaled, dim=1, keepdim=True)
        max_map, _ = torch.max(channel_scaled, dim=1, keepdim=True)
        spatial_gate = self.sigmoid(
            self.spatial_excitation(torch.cat([mean_map, max_map], dim=1))
        )
        return torch.mul(channel_scaled, spatial_gate)

DeepPD/utils_etfc.py ADDED Viewed

	@@ -0,0 +1,367 @@

+import math
+import torch
+from torch import nn
class AddNorm(nn.Module):
    """Residual connection followed by layer normalisation.

    Args:
        normalized: ``normalized_shape`` handed to ``nn.LayerNorm``.
        dropout: dropout probability applied to the residual branch.
    """

    def __init__(self, normalized, dropout):
        super().__init__()
        self.dropout = nn.Dropout(dropout)
        self.ln = nn.LayerNorm(normalized)

    def forward(self, x, y):
        """Return ``LayerNorm(x + Dropout(y))``."""
        summed = x + self.dropout(y)
        return self.ln(summed)
class PositionWiseFFN(nn.Module):
    """Position-wise feed-forward network: Linear -> ReLU -> Linear.

    Args:
        ffn_input: width of the input and of the output.
        ffn_hiddens: width of the hidden layer.
        mlp_bias: whether both linear layers carry a bias term.
    """

    def __init__(self, ffn_input, ffn_hiddens,mlp_bias=True):
        super().__init__()
        expand = nn.Linear(ffn_input, ffn_hiddens, bias=mlp_bias)
        project = nn.Linear(ffn_hiddens, ffn_input, bias=mlp_bias)
        self.ffn = nn.Sequential(expand, nn.ReLU(), project)

    def forward(self, x):
        """Apply the network to the last dimension of ``x``."""
        return self.ffn(x)
+from torch.autograd import Variable
class PositionalEncoding1(nn.Module):
    """Sinusoidal positional encoding stored as the (non-trainable) buffer ``pe``.

    Even feature columns hold sines and odd columns hold cosines.

    Args:
        d_model: feature width of the inputs (odd widths are supported).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions that are precomputed.
    """

    def __init__(self, d_model, dropout, max_len=5000):
        super(PositionalEncoding1, self).__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        position = torch.arange(0, max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float32) *
                             -(math.log(10000.0) / d_model))
        angles = position * div_term
        pe = torch.zeros(max_len, d_model)
        pe[:, 0::2] = torch.sin(angles)
        # For odd d_model there is one cosine column fewer than sine columns.
        pe[:, 1::2] = torch.cos(angles[:, :d_model // 2])
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Add the encoding of the first ``x.size(1)`` positions to ``x``.

        ``x`` is indexed as ``[batch, position, feature]``.
        """
        # Buffers never require gradients, so the deprecated Variable wrapper
        # that used to surround this slice is not needed.
        x = x + self.pe[:, :x.size(1)]
        return self.dropout(x)
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding kept in the plain attribute ``P``.

    ``P`` is a regular tensor attribute (not a parameter or buffer), so it is
    not part of the state dict and is copied to the input's device on every call.

    Args:
        num_hiddens: feature width of the inputs (odd widths are supported).
        dropout: dropout probability applied after adding the encoding.
        max_len: number of positions that are precomputed.
    """

    def __init__(self, num_hiddens, dropout, max_len=1000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(dropout)
        # Create a table P that is long enough for every expected input.
        self.P = torch.zeros((1, max_len, num_hiddens))
        positions = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1)
        frequencies = torch.pow(
            10000, torch.arange(0, num_hiddens, 2, dtype=torch.float32) / num_hiddens
        )
        X = positions / frequencies
        self.P[:, :, 0::2] = torch.sin(X)
        # For odd num_hiddens there is one cosine column fewer than sine columns.
        self.P[:, :, 1::2] = torch.cos(X[:, :num_hiddens // 2])

    def forward(self, X):
        """Add the encoding of the first ``X.shape[1]`` positions to ``X``.

        ``X`` is indexed as ``[batch, position, feature]``.
        """
        X = X + self.P[:, :X.shape[1], :].to(X.device)
        return self.dropout(X)
class AttentionEncode(nn.Module):
    """Self-attention block with add & norm and an optional feed-forward stage.

    Args:
        dropout: dropout probability of the add & norm step.
        embedding_size: feature width of the inputs.
        num_heads: number of attention heads.
        seq_len: sequence length; the layer norm is built over
            ``[seq_len, embedding_size]``, so inputs must have exactly this
            length in their second dimension.
        ffn: when true, a position-wise feed-forward stage (with a second
            add & norm using the same ``addNorm`` module) follows the attention.
        attn_dropout: dropout probability inside the attention module.  The
            default reproduces the value that used to be hard-coded.

    NOTE(review): ``nn.MultiheadAttention`` is created without
    ``batch_first=True`` while the comments in this file describe inputs as
    ``[batch, seq_len, feat]``; with PyTorch's default layout the attention
    would run across the first (batch) axis.  Left untouched because released
    weights were trained with this behaviour -- confirm before changing.
    """

    def __init__(self, dropout, embedding_size, num_heads,seq_len: int=40,ffn=False,attn_dropout: float=0.6):
        super(AttentionEncode, self).__init__()
        self.dropout = dropout
        self.embedding_size = embedding_size
        self.num_heads = num_heads
        self.seq_len = seq_len
        self.is_ffn = ffn
        self.attn_dropout = attn_dropout
        self.att = nn.MultiheadAttention(embed_dim=self.embedding_size,
                                         num_heads=num_heads,
                                         dropout=self.attn_dropout
                                         )
        self.addNorm = AddNorm(normalized=[self.seq_len, self.embedding_size], dropout=self.dropout)
        # Always constructed (even when unused) so that parameter names stay
        # compatible with existing checkpoints.
        self.FFN = PositionWiseFFN(ffn_input=self.embedding_size, ffn_hiddens=self.embedding_size*2)

    def forward(self, x):
        """Return the attended (and optionally feed-forward-refined) encoding of ``x``."""
        MHAtt, _ = self.att(x, x, x)
        MHAtt_encode = self.addNorm(x, MHAtt)
        if self.is_ffn:
            ffn_in = MHAtt_encode
            ffn_out = self.FFN(ffn_in)
            MHAtt_encode = self.addNorm(ffn_in,ffn_out)
        return MHAtt_encode
class FAN_encode(nn.Module):
    """Feed-forward stage followed by add & norm over ``[1, shape]`` features.

    Args:
        dropout: dropout probability of the add & norm step.
        shape: feature width; the hidden layer is twice as wide.

    ``self.ln`` is created but not applied in ``forward``.
    """

    def __init__(self, dropout, shape):
        super().__init__()
        self.dropout = dropout
        self.addNorm = AddNorm(normalized=[1, shape], dropout=self.dropout)
        self.FFN = PositionWiseFFN(ffn_input=shape, ffn_hiddens=(2 * shape))
        self.ln = nn.LayerNorm(shape)

    def forward(self, x):
        """Return ``addNorm(x, FFN(x))``."""
        transformed = self.FFN(x)
        return self.addNorm(x, transformed)
+class ffn_norm(nn.Module):
+    # 可接受二维输入和一维输入
+    def __init__(self,input_dims:int,hidden_dims:int,dropout:float,bias:bool=True):
+        super(ffn_norm,self).__init__()
+        self.inps_dims = input_dims
+        self.hidden_dims = hidden_dims
+        self.dropout = nn.Dropout(dropout)
+        self.ffn_bias = bias
+        self.ffn = nn.Sequential(
+            nn.Linear(self.inps_dims, self.hidden_dims, bias=self.ffn_bias),
+            nn.LeakyReLU(),
+            nn.Linear(self.hidden_dims, self.inps_dims, bias=self.ffn_bias),
+        )
+        self.ln = nn.LayerNorm(self.inps_dims)
+    def forward(self,x):
+        # x:[B,S,H] OR [B,shape],shape:S*H
+        ffn_out = self.ffn(x)
+        norm_out = self.ln(x + self.dropout(ffn_out))
+        return norm_out
+def sequence_mask(X, valid_len, value=0.):
+    """在序列中屏蔽不相关的项"""
+    valid_len = valid_len.float()
+    MaxLen = X.size(1)
+    mask = torch.arange(MaxLen, dtype=torch.float32, device=X.device)[None, :] < valid_len[:, None].to(X.device)
+    X[~mask] = value
+    return X
+def masked_softmax(X, valid_lens):
+    """通过在最后⼀个轴上掩蔽元素来执⾏softmax操作"""
+    # X:3D张量，valid_lens:1D或2D张量
+    if valid_lens is None:
+        return nn.functional.softmax(X, dim=-1)
+    else:
+        shape = X.shape
+    if valid_lens.dim() == 1:
+        valid_lens = torch.repeat_interleave(valid_lens, shape[1])
+    else:
+        valid_lens = valid_lens.reshape(-1)  # 最后⼀轴上被掩蔽的元素使⽤⼀个⾮常⼤的负值替换，从⽽其softmax输出为0
+    X = sequence_mask(X.reshape(-1, shape[-1]), valid_lens, value=-1e6)
+    return nn.functional.softmax(X.reshape(shape), dim=-1)
+# class AdditiveAttention(nn.Module):
+#     """加性注意⼒"""
+#
+#     def __init__(self, key_size, query_size, num_hiddens, dropout):
+#         super(AdditiveAttention, self).__init__()
+#         self.W_k = nn.Linear(key_size, num_hiddens, bias=False)
+#         self.W_q = nn.Linear(query_size, num_hiddens, bias=False)
+#         self.w_v = nn.Linear(num_hiddens, 1, bias=False)
+#         self.dropout = nn.Dropout(dropout)
+#
+#     def forward(self, queries, keys, values, valid_lens):
+#         queries, keys = self.W_q(queries), self.W_k(keys)
+#         # 在维度扩展后，
+#         # queries的形状：(batch_size，查询的个数，1，num_hidden)
+#         # key的形状：(batch_size，1，“键－值”对的个数，num_hiddens)
+#         # 使⽤⼴播⽅式进⾏求和
+#         features = queries.unsqueeze(2) + keys.unsqueeze(1)
+#         features = torch.tanh(features)
+#         # self.w_v仅有⼀个输出，因此从形状中移除最后那个维度。
+#         # scores的形状：(batch_size，查询的个数，“键-值”对的个数)
+#         scores = self.w_v(features).squeeze(-1)
+#         attention_weights = masked_softmax(scores, valid_lens)
+#         # values的形状：(batch_size，“键－值”对的个数，值的维度)
+#         return torch.bmm(self.dropout(attention_weights), values)
+class AdditiveAttention(nn.Module):
+    """注意⼒机制"""
+    def __init__(self, input_size, value_size, num_hiddens, dropout):
+        super(AdditiveAttention, self).__init__()
+        self.W_k = nn.Linear(input_size, num_hiddens, bias=False)
+        self.W_q = nn.Linear(input_size, num_hiddens, bias=False)
+        self.w_v = nn.Linear(input_size, num_hiddens, bias=False)
+        self.w_o = nn.Linear(50, value_size, bias=False)
+        self.dropout = nn.Dropout(dropout)
+    def forward(self, queries, keys, values, valid_lens=None):
+        queries, keys = self.W_q(queries), self.W_k(keys)
+        d = queries.shape[-1]
+        # 在维度扩展后，
+        # queries的形状：(batch_size，查询的个数，1，num_hidden)
+        # key的形状：(batch_size，1，“键－值”对的个数，num_hiddens)
+        # 使⽤⼴播⽅式进⾏求和
+        # features = queries + keys
+        # features = torch.tanh(features)
+        # self.w_v仅有⼀个输出，因此从形状中移除最后那个维度。
+        # scores的形状：(batch_size，查询的个数，“键-值”对的个数)
+        scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(d)
+        scores = self.w_o(scores).permute(0, 2, 1)
+        attention_weights = masked_softmax(scores, valid_lens)
+        # attention_weights = nn.Softmax(dim=1)(scores)
+        values = self.w_v(values)
+        # values = torch.transpose(values, 1, 2)
+        # values的形状：(batch_size，“键－值”对的个数，值的维度)
+        return torch.bmm(self.dropout(attention_weights), values), attention_weights
+class MultiHeadAttention(nn.Module):
+    """多头注意力"""
+    def __init__(self, key_size, query_size, value_size, num_hiddens,
+                 num_heads, dropout, bias=False):
+        super(MultiHeadAttention, self).__init__()
+        self.num_heads = num_heads
+        self.attention = DotProductAttention(dropout)
+        self.W_q = nn.Linear(query_size, num_hiddens, bias=bias)
+        self.W_k = nn.Linear(key_size, num_hiddens, bias=bias)
+        self.W_v = nn.Linear(value_size, num_hiddens, bias=bias)
+        self.W_o = nn.Linear(num_hiddens, num_hiddens, bias=bias)
+    def forward(self, queries, keys, values, valid_lens=None):
+        # queries，keys，values的形状:
+        # (batch_size，查询或者“键－值”对的个数，num_hiddens)
+        # valid_lens　的形状:
+        # (batch_size，)或(batch_size，查询的个数)
+        # 经过变换后，输出的queries，keys，values　的形状:
+        # (batch_size*num_heads，查询或者“键－值”对的个数，
+        # num_hiddens/num_heads)
+        queries = transpose_qkv(self.W_q(queries), self.num_heads)
+        keys = transpose_qkv(self.W_k(keys), self.num_heads)
+        values = transpose_qkv(self.W_v(values), self.num_heads)
+        if valid_lens is not None:
+            # 在轴0，将第一项（标量或者矢量）复制num_heads次，
+            # 然后如此复制第二项，然后诸如此类。
+            valid_lens = torch.repeat_interleave(valid_lens, repeats=self.num_heads, dim=0)
+        # output的形状:(batch_size*num_heads，查询的个数，num_hiddens/num_heads)
+        output = self.attention(queries, keys, values, valid_lens)
+        # output_concat的形状:(batch_size，查询的个数，num_hiddens)
+        output_concat = transpose_output(output, self.num_heads)
+        return self.W_o(output_concat)
+def transpose_qkv(X, num_heads):
+    """为了多注意力头的并行计算而变换形状"""
+    # 输入X的形状:(batch_size，查询或者“键－值”对的个数，num_hiddens)
+    # 输出X的形状:(batch_size，查询或者“键－值”对的个数，num_heads，
+    # num_hiddens/num_heads)
+    X = X.reshape(X.shape[0], X.shape[1], num_heads, -1)
+    # 输出X的形状:(batch_size，num_heads，查询或者“键－值”对的个数,
+    # num_hiddens/num_heads)
+    X = X.permute(0, 2, 1, 3)
+    # 最终输出的形状:(batch_size*num_heads,查询或者“键－值”对的个数,
+    # num_hiddens/num_heads)
+    return X.reshape(-1, X.shape[2], X.shape[3])
+def transpose_output(X, num_heads):
+    """逆转transpose_qkv函数的操作"""
+    X = X.reshape(-1, num_heads, X.shape[1], X.shape[2])
+    X = X.permute(0, 2, 1, 3)
+    return X.reshape(X.shape[0], X.shape[1], -1)
+class DotProductAttention(nn.Module):
+    """缩放点积注意力"""
+    def __init__(self, dropout):
+        super(DotProductAttention, self).__init__()
+        self.dropout = nn.Dropout(dropout)
+    # queries的形状：(batch_size，查询的个数，d)
+    # keys的形状：(batch_size，“键－值”对的个数，d)
+    # values的形状：(batch_size，“键－值”对的个数，值的维度)
+    # valid_lens的形状:(batch_size，)或者(batch_size，查询的个数)
+    def forward(self, queries, keys, values, valid_lens=None):
+        d = queries.shape[-1]
+        # 设置transpose_b=True为了交换keys的最后两个维度
+        scores = torch.bmm(queries, keys.transpose(1, 2)) / math.sqrt(d)
+        attention_weights = masked_softmax(scores, valid_lens)
+        return torch.bmm(self.dropout(attention_weights), values)
+class MASK_AttentionEncode(nn.Module):
+    def __init__(self, dropout, embedding_size, num_heads):
+        super(MASK_AttentionEncode, self).__init__()
+        self.dropout = dropout
+        self.embedding_size = embedding_size
+        self.num_heads = num_heads
+        self.at1 = MultiHeadAttention(key_size=self.embedding_size,
+                                      query_size=self.embedding_size,
+                                      value_size=self.embedding_size,
+                                      num_hiddens=self.embedding_size,
+                                      num_heads=self.num_heads,
+                                      dropout=self.dropout)
+        self.addNorm = AddNorm(normalized=[50, self.embedding_size], dropout=self.dropout)
+        self.FFN = PositionWiseFFN(ffn_num_input=64, ffn_num_hiddens=192, ffn_num_outputs=64)
+    def forward(self, x, y=None):
+        # Multi, _ = self.at1(x, x, x)
+        Multi = self.at1(x, x, x, y)
+        Multi_encode = self.addNorm(x, Multi)
+        # encode_output = self.addNorm(Multi_encode, self.FFN(Multi_encode))
+        return Multi_encode
+class transformer_encode(nn.Module):
+    def __init__(self, dropout, embedding, num_heads):
+        super(transformer_encode, self).__init__()
+        self.dropout = dropout
+        self.embedding_size = embedding
+        self.num_heads = num_heads
+        self.attention = nn.MultiheadAttention(embed_dim=192,
+                                               num_heads=8,
+                                               dropout=0.6
+                                               )
+        self.at1 = MultiHeadAttention(key_size=self.embedding_size,
+                                      query_size=self.embedding_size,
+                                      value_size=self.embedding_size,
+                                      num_hiddens=self.embedding_size,
+                                      num_heads=self.num_heads,
+                                      dropout=self.dropout)
+        self.addNorm = AddNorm(normalized=[50, self.embedding_size], dropout=self.dropout)
+        self.ffn = PositionWiseFFN(ffn_num_input=self.embedding_size, ffn_num_hiddens=2*self.embedding_size,
+                                   ffn_num_outputs=self.embedding_size)
+    def forward(self, x, valid=None):
+        # Multi, _ = self.attention(x, x, x)
+        Multi = self.at1(x, x, x, valid)
+        Multi_encode = self.addNorm(x, Multi)
+        encode_output = self.addNorm(Multi_encode, self.ffn(Multi_encode))
+        return encode_output

app.ipynb ADDED Viewed

	@@ -0,0 +1,205 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from Bio import SeqIO\n",
+    "from DeepPD.data_helper import Data2EqlTensor,Seqs2EqlTensor"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "('LLSEVEELNMSLTALREK', 18)"
+      ]
+     },
+     "execution_count": 2,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "file_path = './homo_test.fa'\n",
+    "data = []\n",
+    "for record in SeqIO.parse(file_path, 'fasta'):\n",
+    "    data.append((record.id, str(record.seq)))\n",
+    "\n",
+    "data[0][1],len(data[0][1])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "default_padding_value: 1\n",
+      "length>40: 0\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "torch.Size([6, 40])"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "seqs,ids = Data2EqlTensor(data,40)\n",
+    "seqs.shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([[ 4,  4,  8,  9,  7,  9,  9,  4, 17, 20,  8,  4, 11,  5,  4, 10,  9, 15,\n",
+       "          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1],\n",
+       "        [11,  5, 21, 19,  6,  8,  4, 14, 16, 15,  8, 21,  6, 10,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1],\n",
+       "        [ 7, 17, 18, 21, 18, 12,  4, 18, 17, 17,  7, 13,  6, 21,  4, 19,  9,  4,\n",
+       "         13,  6, 10,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1],\n",
+       "        [17, 16, 22, 16,  4,  8,  5, 13, 13,  4, 15, 15,  1,  1,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1],\n",
+       "        [ 7,  4,  7,  5,  4, 19,  9,  9, 14,  9, 15, 14, 17,  8,  5,  4, 13, 18,\n",
+       "          4, 15,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1],\n",
+       "        [16,  5, 11, 11, 12, 12,  5, 13, 17, 12, 12, 18,  4,  8, 13, 16, 11, 15,\n",
+       "          9, 15,  9,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
+       "          1,  1,  1,  1]])"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "seqs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Some weights of the model checkpoint at ./DeepPD/BERT were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']\n",
+      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
+      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
+     ]
+    }
+   ],
+   "source": [
+    "from DeepPD.predictor import predict\n",
+    "import torch"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
+    "\n",
+    "def homo_classifier(file,threshold):\n",
+    "    data = []\n",
+    "    for record in SeqIO.parse(file, 'fasta'):\n",
+    "        data.append((record.id, str(record.seq)))\n",
+    "    seqs,ids = Data2EqlTensor(data,40)\n",
+    "    homo_peptide_pred = predict(seqs,data, './weight-Homo/4.pth', threshold, device)\n",
+    "    return homo_peptide_pred"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "default_padding_value: 1\n",
+      "length>40: 0\n"
+     ]
+    },
+    {
+     "data": {
+      "text/plain": [
+       "[['peptide_1', 'LLSEVEELNMSLTALREK', '0.296', 'Non-Peptide'],\n",
+       " ['peptide_2', 'TAHYGSLPQKSHGR', '0.013', 'Non-Peptide'],\n",
+       " ['peptide_3', 'VNFHFILFNNVDGHLYELDGR', '0.809', 'Peptide'],\n",
+       " ['peptide_4', 'NQWQLSADDLKK', '0.827', 'Peptide'],\n",
+       " ['peptide_5', 'VLVALYEEPEKPNSALDFLK', '0.868', 'Peptide'],\n",
+       " ['peptide_6', 'QATTIIADNIIFLSDQTKEKE', '0.043', 'Non-Peptide']]"
+      ]
+     },
+     "execution_count": 7,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "out = homo_classifier(file_path,0.5)\n",
+    "out"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "env3.8",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.0"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

app.py ADDED Viewed

	@@ -0,0 +1,83 @@

+import torch
+from DeepPD.predictor import predict
+from DeepPD.data_helper import Data2EqlTensor
+import gradio as gr
+from Bio import SeqIO
+# Device string passed to predict(): "cuda" when a CUDA device is available, otherwise "cpu".
+device = "cuda" if torch.cuda.is_available() else "cpu"
+def mus_classifier(file,threshold):
+    data = []
+    for record in SeqIO.parse(file.name, 'fasta'):
+        data.append((record.id, str(record.seq)))
+    seqs,_ = Data2EqlTensor(data,40)
+    mus_peptide_pred = predict(seqs,data, './weight-Mus/4.pth', threshold, device)
+    return mus_peptide_pred
+def homo_classifier(file,threshold):
+    data = []
+    for record in SeqIO.parse(file.name, 'fasta'):
+        data.append((record.id, str(record.seq)))
+    seqs,_ = Data2EqlTensor(data,40)
+    homo_peptide_pred = predict(seqs, data, './weight-Homo/4.pth', threshold, device)
+    return homo_peptide_pred
+# {peptide_id:[Type:int(1->peptide,0->non-peptide)]}
+# Gradio UI: one tab per species model. Each tab has a FASTA upload, a threshold
+# slider, a Submit button and a results table wired to the matching classifier.
+# NOTE(review): the notes shown to the user ask for fewer than 128 sequences,
+# but the classifier functions in this file do not enforce that limit.
+with gr.Blocks() as demo:
+    gr.Markdown(" ## DeepPD")
+    gr.Markdown("In this study, we developed a peptide detectability prediction model. The model was used to predict the probability that an amino acid sequence is a peptide.")
+    # --- Homo sapiens tab ---
+    with gr.Tab("Prediction Model(Homo sapiens)"):
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_fasta_homo = gr.File()
+            with gr.Column(scale=2):
+                homo_cutoff = gr.Slider(0, 1, step=0.1, value=0.5, interactive=True, label="Threshold")
+                gr.Markdown("### Note")
+                gr.Markdown("- Limit the number of input sequences to less than 128.")
+                gr.Markdown("- The file should be the Fasta format.")
+                gr.Markdown("- We used only the first 20 amino acids of each N-terminal and C-terminal of the sequence for prediction.")
+                image_button_homo = gr.Button("Submit")
+        with gr.Column():
+            # gr.Markdown(" ### Flip text or image files using this demo.")
+            gr.Markdown("Note: the output scores indicates the probability of the input sequence to be predicted as a Peptide or a Non-Peptide.")
+            frame_homo_output = gr.DataFrame(
+                headers=["Sequence Id", "Sequence", "Probability of peptides", "Peptide"],
+                datatype=["str", "str", "str", 'str'],)
+    # Submit runs homo_classifier on (uploaded file, threshold) and fills the table.
+    image_button_homo.click(homo_classifier, inputs=[input_fasta_homo, homo_cutoff], outputs=frame_homo_output)
+    # --- Mus musculus tab (same layout as the Homo sapiens tab) ---
+    with gr.Tab("Prediction Model(Mus musculus)"):
+        # cutoff = gr.Slider(0, 1, step=0.1, value=0.5, interactive=True)
+        with gr.Row():
+            with gr.Column(scale=2):
+                input_fasta_mus = gr.File()
+                # cutoff = gr.Slider(0, 1, step=0.1, value=0.5, interactive=True, label="threshold")
+                # image_button = gr.Button("Submit")
+            with gr.Column(scale=2):
+                mus_cutoff = gr.Slider(0, 1, step=0.1, value=0.5, interactive=True, label="Threshold")
+                gr.Markdown("### Note")
+                gr.Markdown("- Limit the number of input sequences to less than 128.")
+                gr.Markdown("- The file should be the Fasta format.")
+                gr.Markdown("- We used only the first 20 amino acids of each N-terminal and C-terminal of the sequence for prediction.")
+                image_button_mus = gr.Button("Submit")
+        with gr.Column():
+            # gr.Markdown(" ### Flip text or image files using this demo.")
+            gr.Markdown("Note: the output scores indicates the probability of the input sequence to be predicted as a Peptide or a Non-Peptide.")
+            frame_mus_output = gr.DataFrame(
+                headers=["Sequence Id", "Sequence", "Probability of peptides", "Peptide"],
+                datatype=["str", "str", "str", 'str'],)
+    # Submit runs mus_classifier on (uploaded file, threshold) and fills the table.
+    image_button_mus.click(mus_classifier, inputs=[input_fasta_mus, mus_cutoff], outputs=frame_mus_output)
+    # Footer: project link, license and contact details.
+    with gr.Accordion("Citation"):
+        gr.Markdown("- GitHub: https://github.com/leonern/DeepPD")
+    with gr.Accordion("License"):
+        gr.Markdown("- Released under the [MIT license](https://github.com/leonern/DeepPD/blob/main/LICENSE). ")
+    with gr.Accordion("Contact"):
+        gr.Markdown("- If you have any questions, please file a Github issue or contact me at 107552103310@stu.xju.edu.cn")
+# Enable request queuing before launching.
+# NOTE(review): with the pinned gradio==3.30.0 the positional argument is
+# presumably concurrency_count (number of worker threads) -- confirm.
+demo.queue(4)
+demo.launch() #share=True

homo_test.fa ADDED Viewed

	@@ -0,0 +1,12 @@

+>peptide_1
+LLSEVEELNMSLTALREK
+>peptide_2
+TAHYGSLPQKSHGR
+>peptide_3
+VNFHFILFNNVDGHLYELDGR
+>peptide_4
+NQWQLSADDLKK
+>peptide_5
+VLVALYEEPEKPNSALDFLK
+>peptide_6
+QATTIIADNIIFLSDQTKEKE

requirements.txt ADDED Viewed

	@@ -0,0 +1,7 @@

+biopython==1.81
+fair_esm==2.0.0
+numpy==1.22.3
+torch
+transformers==4.25.1
+gradio==3.30.0
+Bio==1.5.9

weight-Homo/4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:220e8f5094004e171951d84665f1728fe4a206a7447427bbd4db08bb4df3ca18
+size 239141411

weight-Mus/4.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:22a162acab7dd8fa9b2e496edf833fb641fad0de22c97d9f6008fd7865a6a2b6
+size 239141411