xiaoleon commited on
Commit
2d48951
·
1 Parent(s): 8f8219a

initial submission

Browse files
DeepMFPP/ESM2/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/tmp/facebook/esm2_t12_35M_UR50D",
3
+ "architectures": [
4
+ "EsmForMaskedLM"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.0,
7
+ "classifier_dropout": null,
8
+ "emb_layer_norm_before": false,
9
+ "esmfold_config": null,
10
+ "hidden_act": "gelu",
11
+ "hidden_dropout_prob": 0.0,
12
+ "hidden_size": 480,
13
+ "initializer_range": 0.02,
14
+ "intermediate_size": 1920,
15
+ "is_folding_model": false,
16
+ "layer_norm_eps": 1e-05,
17
+ "mask_token_id": 32,
18
+ "max_position_embeddings": 1026,
19
+ "model_type": "esm",
20
+ "num_attention_heads": 20,
21
+ "num_hidden_layers": 12,
22
+ "pad_token_id": 1,
23
+ "position_embedding_type": "rotary",
24
+ "token_dropout": true,
25
+ "torch_dtype": "float32",
26
+ "transformers_version": "4.25.0.dev0",
27
+ "use_cache": true,
28
+ "vocab_list": null,
29
+ "vocab_size": 33
30
+ }
DeepMFPP/ESM2/esm2_t12_35M_UR50D-contact-regression.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:16641e05d830d0ce863dd152dbb8c2f3ddfa3c3ec2a66080152c8abad01d8585
3
+ size 1959
DeepMFPP/ESM2/esm2_t12_35M_UR50D.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7f21e80e61d16a71735163ef555d3009afb0c98da74c48e29df08606973cc55e
3
+ size 134095705
DeepMFPP/ESM2/model_index.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_class_name": "StableDiffusionPipeline",
3
+ "_diffusers_version": "0.8.0",
4
+ "feature_extractor": [
5
+ "transformers",
6
+ "CLIPFeatureExtractor"
7
+ ],
8
+ "safety_checker": [
9
+ null,
10
+ null
11
+ ],
12
+ "scheduler": [
13
+ "diffusers",
14
+ "DDIMScheduler"
15
+ ],
16
+ "text_encoder": [
17
+ "transformers",
18
+ "CLIPTextModel"
19
+ ],
20
+ "tokenizer": [
21
+ "transformers",
22
+ "CLIPTokenizer"
23
+ ],
24
+ "unet": [
25
+ "diffusers",
26
+ "UNet2DConditionModel"
27
+ ],
28
+ "vae": [
29
+ "diffusers",
30
+ "AutoencoderKL"
31
+ ],
32
+ "requires_safety_checker": false
33
+ }
DeepMFPP/ESM2/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "<cls>",
3
+ "eos_token": "<eos>",
4
+ "mask_token": "<mask>",
5
+ "pad_token": "<pad>",
6
+ "unk_token": "<unk>"
7
+ }
DeepMFPP/ESM2/tokenizer_config.json ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {
2
+ "model_max_length": 1000000000000000019884624838656,
3
+ "tokenizer_class": "EsmTokenizer"
4
+ }
DeepMFPP/ESM2/vocab.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <cls>
2
+ <pad>
3
+ <eos>
4
+ <unk>
5
+ L
6
+ A
7
+ G
8
+ V
9
+ S
10
+ E
11
+ R
12
+ T
13
+ I
14
+ D
15
+ P
16
+ K
17
+ Q
18
+ N
19
+ F
20
+ Y
21
+ M
22
+ H
23
+ W
24
+ C
25
+ X
26
+ B
27
+ U
28
+ Z
29
+ O
30
+ .
31
+ -
32
+ <null_1>
33
+ <mask>
DeepMFPP/__pycache__/config.cpython-38.pyc ADDED
Binary file (1.27 kB). View file
 
DeepMFPP/__pycache__/data_helper.cpython-38.pyc ADDED
Binary file (4.68 kB). View file
 
DeepMFPP/__pycache__/model.cpython-38.pyc ADDED
Binary file (3.75 kB). View file
 
DeepMFPP/__pycache__/predictor.cpython-38.pyc ADDED
Binary file (2.62 kB). View file
 
DeepMFPP/__pycache__/utils.cpython-38.pyc ADDED
Binary file (3.77 kB). View file
 
DeepMFPP/__pycache__/utils_etfc.cpython-38.pyc ADDED
Binary file (10.7 kB). View file
 
DeepMFPP/config.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+
4
+ class ArgsConfig:
5
+ def __init__(self) -> None:
6
+ self.batch_size = 192
7
+ self.embedding_size = 480
8
+ self.epochs = 100
9
+ self.kflod = 5
10
+ self.max_len = 51
11
+ self.lr = 1.8e-3
12
+ self.weight_decay = 0
13
+ self.dropout = 0.6
14
+ self.ctl = False
15
+
16
+ self.margin = 2.8
17
+ self.scale_factor = 1
18
+ self.training_ratio = 1.1
19
+ self.model_name = 'DeepMFPP-MFTP'
20
+ self.loss_fn_name = 'MLFDL'
21
+ self.exp_nums = None
22
+ self.aa_dict = 'esm' # 'protbert' /'esm'/ None
23
+ self.class_weight = False
24
+ # self.lr_step_size = 250
25
+ # self.lr_milestones = [20,60,120,180,240]
26
+ # self.lr_gamma = 0.75
27
+ self.fldl_clip_pos = 0.7
28
+ self.fldl_clip_neg = 0.5
29
+ self.fldl_pos_weight = 0.4
30
+ self.info = f"FDL{0.7,0.5,0.3},CosLR,cw={self.class_weight}" #对当前训练做的补充说明
31
+
32
+ # self.data_dir = './data/AllData.txt'
33
+ # self.train_data_dir = './data/MFTP-Data/train.txt' # MLBP-Data/train.txt MFTP-Data/train.txt
34
+ # self.test_data_dir = './data/MFTP-Data/test.txt' # MLBP-Data/test.txt MFTP-Data/test.txt
35
+ # MLBP-Data/train_0.5_min-2_maj-1.txt
36
+ # MFTP-Data/traindata_da/train_rs_2.txt
37
+ # self.train_data_da_dir = './data/MFTP-Data/traindata_da/train_rs_2.txt'
38
+ # self.ebv_dir = './eq_21_21.pkl'
39
+ self.use_ebv = False
40
+
41
+ # self.log_dir = './result/logs'
42
+ # self.save_dir = './result/model_para'
43
+ # self.tensorboard_log_dir = './tensorboard'
44
+ self.ems_path = './DeepMFPP/ESM2/esm2_t12_35M_UR50D.pt'
45
+ self.esm_layer_idx = 12
46
+ # self.save_para_dir = os.path.join(self.save_dir,self.model_name)
47
+ self.random_seed = 2023
48
+ self.num_classes = 21
49
+ self.split_size = 0.8
50
+ # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
51
+ self.device = torch.device("cpu")
52
+ self.continue_training = False
53
+ # self.checkpoint_path = r'result\model_para\CNN_BIGRU_test1\1.pth'
54
+
55
+ # if not os.path.exists(self.log_dir):
56
+ # os.mkdir(self.log_dir)
57
+ # if not os.path.exists(self.save_dir):
58
+ # os.mkdir(self.save_dir)
59
+ # if not os.path.exists(self.save_para_dir):
60
+ # os.mkdir(self.save_para_dir)
61
+ # if not os.path.exists(self.tensorboard_log_dir):
62
+ # os.mkdir(self.tensorboard_log_dir)
63
+
DeepMFPP/data_helper.py ADDED
@@ -0,0 +1,156 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import torch.nn.utils.rnn as rnn_utils
4
+
5
+ def Data2EqlTensor(lines,max_len:int=51,AminoAcid_vocab=None):
6
+ '''
7
+ Args:
8
+ flie:文件路径 \n
9
+ max_len:设定转换后的氨基酸序列最大长度 \n
10
+ vocab_dict:esm or protbert ,默认为按顺序映射的词典
11
+ '''
12
+ # 只保留20种氨基酸和填充数,其余几种非常规氨基酸均用填充数代替
13
+ # 使用 esm和portbert字典时,nn.embedding()的vocab_size = 25
14
+ if AminoAcid_vocab =='esm':
15
+ aa_dict = {'[PAD]': 1, 'L': 4, 'A': 5, 'G': 6, 'V': 7, 'S': 8, 'E': 9, 'R': 10, 'T': 11, 'I': 12, 'D': 13, 'P': 14, 'K': 15, 'Q': 16,
16
+ 'N': 17, 'F': 18, 'Y': 19, 'M': 20, 'H': 21, 'W': 22, 'C': 23, 'X': 1, 'B': 1, 'U': 1, 'Z': 1, 'O': 1}
17
+ elif AminoAcid_vocab == 'protbert':
18
+ aa_dict = {'[PAD]':0,'L': 5, 'A': 6, 'G': 7, 'V': 8, 'E': 9, 'S': 10, 'I': 11, 'K': 12, 'R': 13, 'D': 14, 'T': 15,
19
+ 'P': 16, 'N': 17, 'Q': 18, 'F': 19, 'Y': 20, 'M': 21, 'H': 22, 'C': 23, 'W': 24, 'X': 0, 'U': 0, 'B': 0, 'Z': 0, 'O': 0}
20
+ else:
21
+ aa_dict = {'[PAD]':0,'A':1,'C':2,'D':3,'E':4,'F':5,'G':6,'H':7,'I':8,'K':9,'L':10,'M':11,'N':12,'P':13,'Q':14,'R':15,
22
+ 'S':16,'T':17,'V':18,'W':19,'Y':20,'U':0,'X':0,'J':0}
23
+ ## Esm vocab
24
+ ## protbert vocab
25
+
26
+ padding_key = '[PAD]'
27
+ default_padding_value = 0
28
+ if padding_key in aa_dict:
29
+ dict_padding_value = aa_dict.get('[PAD]')
30
+ else:
31
+ dict_padding_value = default_padding_value
32
+ print(f"No padding value in the implicit dictionary, set to {default_padding_value} by default")
33
+
34
+ # assert len(lines) % 2 == 0, "Invalid file format. Number of lines should be even."
35
+
36
+ long_pep_counter = 0
37
+ pep_codes = []
38
+ labels = []
39
+ ids = []
40
+ pad_flag = 1
41
+ for id,pep in lines:
42
+ ids.append(id)
43
+ x = len(pep)
44
+
45
+ if x < max_len:
46
+ current_pep=[]
47
+ for aa in pep:
48
+ if aa.upper() in aa_dict.keys():
49
+ current_pep.append(aa_dict[aa.upper()])
50
+ # 将第一个长度<max_len的序列填充到40,确保当输入序列均<max_len时,所有序列仍然能够填充到max_len
51
+ if pad_flag:
52
+ current_pep.extend([dict_padding_value] * (max_len - len(current_pep)))
53
+ pad_flag = 0
54
+ pep_codes.append(torch.tensor(current_pep)) # torch.tensor(current_pep)
55
+ else:
56
+ pep_head = pep[0:int(max_len/2)]
57
+ pep_tail = pep[int(x-int(max_len/2)):int(x)]
58
+ new_pep = pep_head+pep_tail
59
+ current_pep=[]
60
+ for aa in new_pep:
61
+ current_pep.append(aa_dict[aa])
62
+ pep_codes.append(torch.tensor(current_pep))
63
+ long_pep_counter += 1
64
+
65
+ print("length > {}:{}".format(max_len,long_pep_counter))
66
+ data = rnn_utils.pad_sequence(pep_codes,batch_first=True,padding_value=dict_padding_value)
67
+ return data,torch.tensor(labels)
68
+
69
+ def SeqsData2EqlTensor(file_path:str,max_len:int,AminoAcid_vocab=None):
70
+ '''
71
+ Args:
72
+ flie:文件路径 \n
73
+ max_len:设定转换后的氨基酸序列最大长度 \n
74
+ vocab_dict:esm or protbert ,默认为按顺序映射的词典
75
+ '''
76
+ # 只保留20种氨基酸和填充数,其余几种非常规氨基酸均用填充数代替
77
+ # 使用 esm和portbert字典时,nn.embedding()的vocab_size = 25
78
+ if AminoAcid_vocab =='esm':
79
+ aa_dict = {'[PAD]': 1, 'L': 4, 'A': 5, 'G': 6, 'V': 7, 'S': 8, 'E': 9, 'R': 10, 'T': 11, 'I': 12, 'D': 13, 'P': 14, 'K': 15, 'Q': 16,
80
+ 'N': 17, 'F': 18, 'Y': 19, 'M': 20, 'H': 21, 'W': 22, 'C': 23, 'X': 1, 'B': 1, 'U': 1, 'Z': 1, 'O': 1}
81
+ elif AminoAcid_vocab == 'protbert':
82
+ aa_dict = {'[PAD]':0,'L': 5, 'A': 6, 'G': 7, 'V': 8, 'E': 9, 'S': 10, 'I': 11, 'K': 12, 'R': 13, 'D': 14, 'T': 15,
83
+ 'P': 16, 'N': 17, 'Q': 18, 'F': 19, 'Y': 20, 'M': 21, 'H': 22, 'C': 23, 'W': 24, 'X': 0, 'U': 0, 'B': 0, 'Z': 0, 'O': 0}
84
+ else:
85
+ aa_dict = {'[PAD]':0,'A':1,'C':2,'D':3,'E':4,'F':5,'G':6,'H':7,'I':8,'K':9,'L':10,'M':11,'N':12,'P':13,'Q':14,'R':15,
86
+ 'S':16,'T':17,'V':18,'W':19,'Y':20,'U':0,'X':0,'J':0}
87
+ ## Esm vocab
88
+ ## protbert vocab
89
+
90
+ padding_key = '[PAD]'
91
+ default_padding_value = 0
92
+ if padding_key in aa_dict:
93
+ dict_padding_value = aa_dict.get('[PAD]')
94
+ else:
95
+ dict_padding_value = default_padding_value
96
+ print(f"No padding value in the implicit dictionary, set to {default_padding_value} by default")
97
+
98
+ with open(file_path, 'r') as inf:
99
+ lines = inf.read().splitlines()
100
+ assert len(lines) % 2 == 0, "Invalid file format. Number of lines should be even."
101
+
102
+ long_pep_counter=0
103
+ pep_codes=[]
104
+ labels=[]
105
+ for line in lines:
106
+ if line[0] == '>':
107
+ labels.append([int(i) for i in line[1:]])
108
+ else:
109
+ x = len(line)
110
+
111
+ if x < max_len:
112
+ current_pep=[]
113
+ for aa in line:
114
+ if aa.upper() in aa_dict.keys():
115
+ current_pep.append(aa_dict[aa.upper()])
116
+ pep_codes.append(torch.tensor(current_pep)) #torch.tensor(current_pep)
117
+ else:
118
+ pep_head = line[0:int(max_len/2)]
119
+ pep_tail = line[int(x-int(max_len/2)):int(x)]
120
+ new_pep = pep_head+pep_tail
121
+ current_pep=[]
122
+ for aa in new_pep:
123
+ current_pep.append(aa_dict[aa])
124
+ pep_codes.append(torch.tensor(current_pep))
125
+ long_pep_counter += 1
126
+
127
+ print("length > {}:{}".format(max_len,long_pep_counter))
128
+ data = rnn_utils.pad_sequence(pep_codes,batch_first=True,padding_value=dict_padding_value)
129
+ return data,torch.tensor(labels)
130
+
131
+ def index_alignment(batch,condition_num=0,subtraction_num1=4,subtraction_num2=1):
132
+ '''将其他蛋白质语言模型的字典索引和默认字典索引进行对齐,保持氨基酸索引只有20个数构成,且范围在[1,20],[PAD]=0或者1 \n
133
+ "esm"模型,condition_num=1,subtraction_num1=3,subtraction_num2=1; \n
134
+ "protbert"模型,condition_num=0,subtraction_num1=4
135
+
136
+ Args:
137
+ batch:形状为[batch_size,seq_len]的二维张量 \n
138
+ condition_num:字典中的[PAD]值 \n
139
+ subtraction_num1:对齐非[PAD]元素所需减掉的差值 \n
140
+ subtraction_num2:对齐[PAD]元素所需减掉的差值
141
+
142
+ return:
143
+ shape:[batch_size,seq_len],dtype=tensor.
144
+ '''
145
+ condition = batch == condition_num
146
+ # 创建一个张量,形状和batch相同,表示非[PAD]元素要减去的值
147
+ subtraction = torch.full_like(batch, subtraction_num1)
148
+ if condition_num==0:
149
+ # 使用torch.where()函数来选择batch中为0的元素或者batch减去subtraction中的元素
150
+ output = torch.where(condition, batch, batch - subtraction)
151
+ elif condition_num==1:
152
+ # 创建一个张量,形状和batch相同,表示[PAD]元素要减去的值
153
+ subtraction_2 = torch.full_like(batch, subtraction_num2)
154
+ output = torch.where(condition, batch-subtraction_2, batch - subtraction)
155
+
156
+ return output
DeepMFPP/model.py ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import esm,math
3
+ import torch.nn as nn
4
+ import torch.nn.functional as F
5
+ from DeepMFPP.utils import PositionalEncoding,FAN_encode
6
+ from DeepMFPP.data_helper import index_alignment
7
+
8
+ class DeepMFPP(nn.Module):
9
+ def __init__(self, vocab_size: int, embedding_size: int, fan_layer_num: int=1, num_heads: int=8, encoder_layer_num: int = 1,
10
+ output_size: int = 21, layer_idx=None, esm_path=None, dropout: float = 0.6, max_pool=5, Contrastive_Learning=False):
11
+ super(DeepMFPP,self).__init__()
12
+
13
+ self.vocab_size = vocab_size
14
+ self.embedding_size = embedding_size
15
+ self.output_size = output_size
16
+ self.dropout = dropout
17
+ self.dropout_layer = nn.Dropout(self.dropout)
18
+ self.encoder_layer_num = encoder_layer_num
19
+ self.fan_layer_num = fan_layer_num
20
+ self.num_heads = num_heads
21
+ self.max_pool = max_pool
22
+ self.ctl = Contrastive_Learning
23
+ self.ffn_size = self.embedding_size*2
24
+ self.dropout_layer1 = nn.Dropout(0.4)
25
+
26
+ self.ESMmodel,_ = esm.pretrained.load_model_and_alphabet_local(esm_path)
27
+ self.ESMmodel.eval()
28
+ self.layer_idx = layer_idx
29
+
30
+ self.out_chs = 64
31
+ final_feats_shape = self.out_chs*50
32
+ self.embedding = nn.Embedding(self.vocab_size, self.embedding_size)
33
+ self.pos_encoding = PositionalEncoding(num_hiddens=self.embedding_size,dropout=self.dropout)
34
+ # self.attention_encode = AttentionEncode(self.dropout, self.embedding_size, self.num_heads,ffn=False)
35
+
36
+ self.ffn = nn.Sequential(
37
+ nn.Linear(self.embedding_size, self.embedding_size*2, bias=True),
38
+ nn.GELU(),
39
+ # nn.LeakyReLU(),
40
+ nn.Linear(self.embedding_size*2, self.embedding_size, bias=True),
41
+ )
42
+ self.ln1 = nn.LayerNorm(self.embedding_size)
43
+ self.softmax = nn.Softmax(dim=-1)
44
+ self.W_o = nn.Linear(self.embedding_size,self.embedding_size)
45
+
46
+ self.kernel_sizes = [3,5,7,11,15]
47
+ self.MaxPool1d = nn.MaxPool1d(kernel_size=self.max_pool)
48
+ self.all_conv = nn.ModuleList([
49
+ nn.Sequential(
50
+ nn.Conv1d(self.embedding_size,out_channels=self.out_chs,kernel_size=self.kernel_sizes[i],padding=(self.kernel_sizes[i]-1)//2),
51
+ nn.BatchNorm1d(self.out_chs),
52
+ nn.LeakyReLU()
53
+ )
54
+ for i in range(len(self.kernel_sizes))
55
+ ])
56
+
57
+ # self.project_layer =nn.Linear(self.embedding_size,64)
58
+ self.fan = FAN_encode(self.dropout, final_feats_shape)
59
+ self.proj_layer = nn.Sequential( nn.Linear(final_feats_shape,1280),
60
+ nn.BatchNorm1d(1280),
61
+ nn.LeakyReLU(),
62
+ nn.Linear(1280,128)
63
+ )
64
+ self.fc = nn.Sequential(
65
+ nn.BatchNorm1d(128),
66
+ nn.LeakyReLU(),
67
+ nn.Linear(128,self.output_size)
68
+ )
69
+
70
+ def CNN1DNet(self,x):
71
+
72
+ for i in range(len(self.kernel_sizes)):
73
+ conv = self.all_conv[i]
74
+ conv_x = conv(x)
75
+ conv_x = self.MaxPool1d(conv_x)
76
+ if i == 0:
77
+ all_feats = conv_x
78
+ else:
79
+ all_feats = torch.cat([all_feats,conv_x],dim=-1)
80
+
81
+ return all_feats
82
+
83
+ def forward(self, x):
84
+ B,S = x.shape
85
+ H = self.embedding_size
86
+
87
+ # --- ESM layer ----
88
+ with torch.no_grad():
89
+ results = self.ESMmodel(x, repr_layers=[self.layer_idx], return_contacts=False)
90
+ esm_x = results["representations"][self.layer_idx] #* 50 480 /640 /1280
91
+
92
+ # --- feature A Embedding+PE layer ----
93
+ index_ali_x = index_alignment(x,condition_num=1,subtraction_num1=3,subtraction_num2=1)
94
+ embedding_x = self.embedding(index_ali_x) # [batch_size,seq_len,embedding_size]
95
+ pos_x = self.pos_encoding(embedding_x * math.sqrt(self.embedding_size)) # [batch_size,seq_len,embedding_size]
96
+ feats1 = pos_x
97
+ # feats1 = embedding_x
98
+ # feats_fuse = feats1
99
+
100
+ # for _ in range(self.encoder_layer_num):
101
+ # feats1 = self.attention_encode(feats1)
102
+ # feats1 += embedding_x # B,S,H
103
+ # feats1 += esm_x
104
+
105
+ feats2 = esm_x
106
+ # feats_fuse = feats2
107
+
108
+ # # --- Self-attention feature fuse ---
109
+ d = feats1.size(-1)
110
+ q,k = feats1, feats2
111
+ v = feats1 + feats2 #+ esm_x
112
+ feats_qk = q @ k.transpose(-1, -2)*math.sqrt(d)
113
+ feats_qk = self.softmax(feats_qk)
114
+ feats_v = feats_qk @ v
115
+ # 线性变换投影到输出向量空间
116
+ feats_v = self.W_o(feats_v) # [B,S,H]
117
+ ffn_y = self.ffn(self.ln1(feats_v)) # 这两行的结构好像只能这样写
118
+ feats_fuse = v + self.dropout_layer(ffn_y)
119
+ # feats_fuse = feats1 + feats2
120
+ # feats_final = self.dropout_layer(self.project_layer(feats_fuse))
121
+
122
+ # # --- 1DCNN layer ---
123
+ cnn_input = feats_fuse
124
+ cnn_input = cnn_input.permute(0, 2, 1) # [B,H,S]
125
+ feats3 = self.CNN1DNet(cnn_input) # [B,F,S] F:out_chas
126
+ feats3 = self.dropout_layer(feats3)
127
+ feats_final = feats3
128
+
129
+ # --- FFN layer ---
130
+ fan_input = feats_final.view(x.size(0),-1) # B,seq_len*feat_dim:50*64
131
+ fan_input = fan_input.unsqueeze(1) # B,1,seq_len*feat_dim:50*64 AddNorm中的normalized=[1, shape]
132
+ for _ in range(self.fan_layer_num):
133
+ fan_encode = self.fan(fan_input)
134
+ fan_out = fan_encode.squeeze(1)
135
+ # fan_out = fan_input.squeeze(1)
136
+
137
+ # --- CLSFC layer ---
138
+ hidden = self.proj_layer(fan_out)
139
+ logits = self.fc(hidden)
140
+
141
+ # return feats1,feats2,feats_fuse,feats_final,fan_out,hidden,logits
142
+ return hidden,logits
DeepMFPP/predictor.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from DeepMFPP.model import DeepMFPP
2
+ import torch
3
+ import torch.nn as nn
4
+ import numpy as np
5
+ from DeepMFPP.config import ArgsConfig
6
+
7
+ args = ArgsConfig()
8
+ args.embedding_size = 480
9
+ args.aa_dict = 'esm'
10
+ args.loss_fn_name = 'MLFDL'
11
+ args.weight_decay = 0
12
+ args.batch_size = 192
13
+ args.dropout = 0.62
14
+ args.scale_factor = 100
15
+ args.fldl_pos_weight = 0.4
16
+
17
+ sigmoid = nn.Sigmoid()
18
+ def predict(seqs:torch.Tensor,data:list,model_path:str, top_k:int=0,threshold:float=0.5, device=args.device):
19
+ torch.manual_seed(args.random_seed)
20
+ with torch.no_grad():
21
+ model = DeepMFPP(vocab_size=21,embedding_size=args.embedding_size, encoder_layer_num=1, fan_layer_num=1, num_heads=8,output_size=args.num_classes,
22
+ esm_path=args.ems_path,layer_idx=args.esm_layer_idx,dropout=args.dropout,Contrastive_Learning=args.ctl).to(args.device)
23
+ model.eval()
24
+ state_dict = torch.load(model_path, map_location=device)
25
+ model.load_state_dict(state_dict,strict=False)
26
+ model.to(args.device)
27
+ # print(device)
28
+ seqs.to(args.device)
29
+ _, logits = model(seqs)
30
+ prob = sigmoid(logits)
31
+ # logits = np.round(logits.cpu().numpy(),3)
32
+ # prob = np.round(prob.cpu().numpy(),3)
33
+ # logits = logits.cpu().numpy()
34
+ prob = prob.cpu().numpy()
35
+ # print(logits)
36
+ # print(prob)
37
+ categories = ['AAP', 'ABP', 'ACP', 'ACVP','ADP', 'AEP', 'AFP', 'AHIVP', 'AHP', 'AIP', 'AMRSAP',
38
+ 'APP', 'ATP', 'AVP', 'BBP', 'BIP', 'CPP', 'DPPIP', 'QSP', 'SBP', 'THP']
39
+ final_out = []
40
+ for i, j, k in zip(data, logits, prob):
41
+ temp = [i[0], i[1]] # , f"logits:{j}", f"probability:{k}"
42
+
43
+ # 过滤概率值大于阈值的预测结果
44
+ result_dict = {}
45
+ for label, p in zip(categories, k):
46
+ # print(p)
47
+ if p > threshold:
48
+ result_dict[label] = round(float(p), 4)
49
+
50
+ # 返回概率值大于阈值的字典对
51
+ # 示例: {'AVP': 0.567, 'ATP': 0.678, ...}
52
+ if result_dict:
53
+ sorted_result = {k: v for k, v in sorted(result_dict.items(), key=lambda item: item[1], reverse=True)}
54
+ else:
55
+ sorted_result = {}
56
+ # print(sorted_result)
57
+
58
+ if top_k:
59
+ sorted_items_list = list(sorted_result.items())
60
+ top_k_result = dict(sorted_items_list[:top_k])
61
+ top_k_result_str = ", ".join(f"{key}: {value}" for key, value in top_k_result.items())
62
+ temp.extend([top_k_result_str])
63
+
64
+ else:
65
+ sorted_result_str = ", ".join(f"{key}: {value}" for key, value in sorted_result.items())
66
+ temp.extend([sorted_result_str])
67
+
68
+ final_out.append(temp)
69
+
70
+ return final_out
DeepMFPP/utils.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch.nn as nn
2
+ import torch
3
+ import torch.nn.functional as F
4
+ import numpy as np
5
+
6
+ # transformer modules
7
+ class AddNorm(nn.Module):
8
+ """残差连接后进行层归一化"""
9
+
10
+ def __init__(self, normalized, dropout):
11
+ super(AddNorm, self).__init__()
12
+ self.dropout = nn.Dropout(dropout)
13
+ self.ln = nn.LayerNorm(normalized)
14
+
15
+ def forward(self, x, y):
16
+ return self.ln(x + self.dropout(y))
17
+
18
+
19
+ class PositionWiseFFN(nn.Module):
20
+ """基于位置的前馈⽹络"""
21
+
22
+ def __init__(self, ffn_input, ffn_hiddens,mlp_bias=True):
23
+ super(PositionWiseFFN, self).__init__()
24
+ self.ffn = nn.Sequential(
25
+ nn.Linear(ffn_input, ffn_hiddens, bias=mlp_bias),
26
+ nn.ReLU(),
27
+ nn.Linear(ffn_hiddens, ffn_input, bias=mlp_bias),
28
+ )
29
+
30
+ def forward(self, x):
31
+ return self.ffn(x)
32
+
33
+ class PositionalEncoding(nn.Module):
34
+ """位置编码"""
35
+
36
+ def __init__(self, num_hiddens, dropout, max_len=1000):
37
+ super(PositionalEncoding, self).__init__()
38
+ self.dropout = nn.Dropout(dropout)
39
+ # 创建⼀个⾜够⻓的P
40
+ self.P = torch.zeros((1, max_len, num_hiddens))
41
+ X = torch.arange(max_len, dtype=torch.float32).reshape(-1, 1) / torch.pow(10000, torch.arange(0, num_hiddens, 2,
42
+ dtype=torch.float32) / num_hiddens)
43
+ self.P[:, :, 0::2] = torch.sin(X)
44
+ self.P[:, :, 1::2] = torch.cos(X)
45
+
46
+ def forward(self, X):
47
+ X = X + self.P[:, :X.shape[1], :].to(X.device)
48
+ return self.dropout(X)
49
+
50
+ class AttentionEncode(nn.Module):
51
+
52
+ def __init__(self, dropout, embedding_size, num_heads,ffn=False):
53
+ super(AttentionEncode, self).__init__()
54
+ self.dropout = dropout
55
+ self.embedding_size = embedding_size
56
+ self.num_heads = num_heads
57
+ self.seq_len = 50
58
+ self.is_ffn = ffn
59
+
60
+ self.att = nn.MultiheadAttention(embed_dim=self.embedding_size,
61
+ num_heads=num_heads,
62
+ dropout=0.6
63
+ )
64
+
65
+ self.addNorm = AddNorm(normalized=[self.seq_len, self.embedding_size], dropout=self.dropout)
66
+
67
+ self.FFN = PositionWiseFFN(ffn_input=self.embedding_size, ffn_hiddens=self.embedding_size*2)
68
+
69
+ def forward(self, x):
70
+ bs,_,_ = x.size()
71
+ MHAtt, _ = self.att(x, x, x)
72
+ MHAtt_encode = self.addNorm(x, MHAtt)
73
+
74
+ if self.is_ffn:
75
+ ffn_in = MHAtt_encode # bs,seq_len,feat_dims
76
+ ffn_out = self.FFN(ffn_in)
77
+ MHAtt_encode = self.addNorm(ffn_in,ffn_out)
78
+
79
+ return MHAtt_encode
80
+
81
+ class FAN_encode(nn.Module):
82
+
83
+ def __init__(self, dropout, shape):
84
+ super(FAN_encode, self).__init__()
85
+ self.dropout = dropout
86
+ self.addNorm = AddNorm(normalized=[1, shape], dropout=self.dropout)
87
+ self.FFN = PositionWiseFFN(ffn_input=shape, ffn_hiddens=(2*shape))
88
+ self.ln = nn.LayerNorm(shape)
89
+
90
+ def forward(self, x):
91
+ #x = self.ln(x)
92
+ ffn_out = self.FFN(x)
93
+ encode_output = self.addNorm(x, ffn_out)
94
+
95
+ return encode_output
README.md CHANGED
@@ -1,10 +1,10 @@
1
  ---
2
- title: DeepMFPP
3
- emoji: 🏢
4
  colorFrom: gray
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 4.13.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
 
1
  ---
2
+ title: DeepPD
3
+ emoji: 🐠
4
  colorFrom: gray
5
+ colorTo: green
6
  sdk: gradio
7
+ sdk_version: 3.33.1
8
  app_file: app.py
9
  pinned: false
10
  license: mit
app.ipynb ADDED
@@ -0,0 +1,322 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "from Bio import SeqIO\n",
10
+ "from DeepMFPP.data_helper import Data2EqlTensor"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "metadata": {},
17
+ "outputs": [
18
+ {
19
+ "data": {
20
+ "text/plain": [
21
+ "('ARRRRCSDRFRNCPADEALCGRRRR', 25)"
22
+ ]
23
+ },
24
+ "execution_count": 2,
25
+ "metadata": {},
26
+ "output_type": "execute_result"
27
+ }
28
+ ],
29
+ "source": [
30
+ "\"\"\"\n",
31
+ ">000000000000000010000\n",
32
+ ">010000000010000000000\n",
33
+ ">010001000010000000000\n",
34
+ ">011000000001000000000\n",
35
+ ">100000000000000000000\n",
36
+ "\"\"\"\n",
37
+ "\n",
38
+ "file_path = './test_samples.fa'\n",
39
+ "data = []\n",
40
+ "for record in SeqIO.parse(file_path, 'fasta'):\n",
41
+ " data.append((record.id, str(record.seq)))\n",
42
+ "\n",
43
+ "data[0][1],len(data[0][1])"
44
+ ]
45
+ },
46
+ {
47
+ "cell_type": "code",
48
+ "execution_count": 3,
49
+ "metadata": {},
50
+ "outputs": [
51
+ {
52
+ "data": {
53
+ "text/plain": [
54
+ "[('peptide_1', 'ARRRRCSDRFRNCPADEALCGRRRR'),\n",
55
+ " ('peptide_2', 'FFHHIFRGIVHVGKTIHKLVTGT'),\n",
56
+ " ('peptide_3', 'GLRKRLRKFRNKIKEKLKKIGQKIQGFVPKLAPRTDY'),\n",
57
+ " ('peptide_4', 'FLGALWNVAKSVF'),\n",
58
+ " ('peptide_5', 'KIKSCYYLPCFVTS')]"
59
+ ]
60
+ },
61
+ "execution_count": 3,
62
+ "metadata": {},
63
+ "output_type": "execute_result"
64
+ }
65
+ ],
66
+ "source": [
67
+ "data"
68
+ ]
69
+ },
70
+ {
71
+ "cell_type": "code",
72
+ "execution_count": 4,
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "length > 50:0\n"
80
+ ]
81
+ },
82
+ {
83
+ "data": {
84
+ "text/plain": [
85
+ "torch.Size([5, 50])"
86
+ ]
87
+ },
88
+ "execution_count": 4,
89
+ "metadata": {},
90
+ "output_type": "execute_result"
91
+ }
92
+ ],
93
+ "source": [
94
+ "seqs,ids = Data2EqlTensor(data,50)\n",
95
+ "seqs.shape"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "seqs"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 25,
110
+ "metadata": {},
111
+ "outputs": [],
112
+ "source": [
113
+ "from DeepMFPP.model import DeepMFPP\n",
114
+ "import torch\n",
115
+ "import torch.nn as nn\n",
116
+ "import numpy as np\n",
117
+ "from DeepMFPP.config import ArgsConfig\n",
118
+ "\n",
119
+ "args = ArgsConfig()\n",
120
+ "args.embedding_size = 480\n",
121
+ "args.aa_dict = 'esm'\n",
122
+ "args.loss_fn_name = 'MLFDL'\n",
123
+ "args.weight_decay = 0\n",
124
+ "args.batch_size = 192\n",
125
+ "args.dropout = 0.62\n",
126
+ "args.scale_factor = 100\n",
127
+ "args.fldl_pos_weight = 0.4\n",
128
+ "\n",
129
+ "sigmoid = nn.Sigmoid()\n",
130
+ "def predict(seqs:torch.Tensor,data:list,model_path:str, top_k:int=0,threshold:float=0.5, device=args.device):\n",
131
+ " torch.manual_seed(args.random_seed)\n",
132
+ " with torch.no_grad():\n",
133
+ " model = DeepMFPP(vocab_size=21,embedding_size=args.embedding_size, encoder_layer_num=1, fan_layer_num=1, num_heads=8,output_size=args.num_classes,\n",
134
+ " esm_path=args.ems_path,layer_idx=args.esm_layer_idx,dropout=args.dropout,Contrastive_Learning=args.ctl).to(args.device)\n",
135
+ " model.eval()\n",
136
+ " state_dict = torch.load(model_path, map_location=device)\n",
137
+ " model.load_state_dict(state_dict,strict=False)\n",
138
+ " model.to(args.device)\n",
139
+ " # print(device)\n",
140
+ " seqs.to(args.device)\n",
141
+ " _, logits = model(seqs)\n",
142
+ " prob = sigmoid(logits)\n",
143
+ " # logits = np.round(logits.cpu().numpy(),3)\n",
144
+ " # prob = np.round(prob.cpu().numpy(),3)\n",
145
+ " # logits = logits.cpu().numpy()\n",
146
+ " prob = prob.cpu().numpy()\n",
147
+ " # print(logits)\n",
148
+ " # print(prob)\n",
149
+ " categories = ['AAP', 'ABP', 'ACP', 'ACVP','ADP', 'AEP', 'AFP', 'AHIVP', 'AHP', 'AIP', 'AMRSAP', \n",
150
+ " 'APP', 'ATP', 'AVP', 'BBP', 'BIP', 'CPP', 'DPPIP', 'QSP', 'SBP', 'THP']\n",
151
+ " final_out = []\n",
152
+ " for i, j, k in zip(data, logits, prob):\n",
153
+ " temp = [i[0], i[1]] # , f\"logits:{j}\", f\"probability:{k}\"\n",
154
+ " \n",
155
+ " # 过滤概率值大于阈值的预测结果\n",
156
+ " result_dict = {}\n",
157
+ " for label, p in zip(categories, k):\n",
158
+ " # print(p)\n",
159
+ " if p > threshold:\n",
160
+ " result_dict[label] = round(float(p), 4)\n",
161
+ " \n",
162
+ " # 返回概率值大于阈值的字典对\n",
163
+ " # 示例: {'AVP': 0.567, 'ATP': 0.678, ...}\n",
164
+ " if result_dict:\n",
165
+ " sorted_result = {k: v for k, v in sorted(result_dict.items(), key=lambda item: item[1], reverse=True)}\n",
166
+ " else:\n",
167
+ " sorted_result = {}\n",
168
+ " # print(sorted_result)\n",
169
+ "\n",
170
+ " # 选择概率值最高的 top_k 个预测结果\n",
171
+ " if top_k: \n",
172
+ " sorted_items_list = list(sorted_result.items())\n",
173
+ " top_k_result = dict(sorted_items_list[:top_k])\n",
174
+ " top_k_result_str = \", \".join(f\"{key}: {value}\" for key, value in top_k_result.items())\n",
175
+ " temp.extend([top_k_result_str])\n",
176
+ " \n",
177
+ " else:\n",
178
+ " sorted_result_str = \", \".join(f\"{key}: {value}\" for key, value in sorted_result.items())\n",
179
+ " temp.extend([sorted_result_str])\n",
180
+ " \n",
181
+ " final_out.append(temp)\n",
182
+ " \n",
183
+ " return final_out"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 26,
189
+ "metadata": {},
190
+ "outputs": [],
191
+ "source": [
192
+ "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
193
+ "\n",
194
+ "def MFPs_classifier(file:str,threshold:float=0.5,top_k=0):\n",
195
+ " data = []\n",
196
+ " for record in SeqIO.parse(file, 'fasta'):\n",
197
+ " data.append((record.id, str(record.seq)))\n",
198
+ " seqs,_ = Data2EqlTensor(data,51,AminoAcid_vocab=args.aa_dict)\n",
199
+ " model_weight_path = './weight/DeepMFPP-Best.pth'\n",
200
+ " MFPs_pred = predict(seqs=seqs, data=data, model_path=model_weight_path, threshold=threshold,top_k=top_k,device=device)\n",
201
+ " \n",
202
+ " return MFPs_pred"
203
+ ]
204
+ },
205
+ {
206
+ "cell_type": "code",
207
+ "execution_count": 27,
208
+ "metadata": {},
209
+ "outputs": [
210
+ {
211
+ "name": "stdout",
212
+ "output_type": "stream",
213
+ "text": [
214
+ "length > 51:0\n"
215
+ ]
216
+ },
217
+ {
218
+ "data": {
219
+ "text/plain": [
220
+ "[['peptide_1',\n",
221
+ " 'ARRRRCSDRFRNCPADEALCGRRRR',\n",
222
+ " 'ADP: 0.5739, BBP: 0.5358, AAP: 0.5204, AIP: 0.5153, DPPIP: 0.5056, ABP: 0.5'],\n",
223
+ " ['peptide_2',\n",
224
+ " 'FFHHIFRGIVHVGKTIHKLVTGT',\n",
225
+ " 'ADP: 0.5633, AIP: 0.5371, BBP: 0.5369, AAP: 0.5235, DPPIP: 0.5084, SBP: 0.5065, AHP: 0.5059, APP: 0.5027, ACP: 0.502'],\n",
226
+ " ['peptide_3',\n",
227
+ " 'GLRKRLRKFRNKIKEKLKKIGQKIQGFVPKLAPRTDY',\n",
228
+ " 'AAP: 0.5418, ADP: 0.5346, ABP: 0.5167, DPPIP: 0.5162, AIP: 0.5081, QSP: 0.5047, APP: 0.5034'],\n",
229
+ " ['peptide_4',\n",
230
+ " 'FLGALWNVAKSVF',\n",
231
+ " 'ADP: 0.5684, BBP: 0.5619, AHP: 0.5381, AAP: 0.5319, AIP: 0.5189, ACP: 0.5124, QSP: 0.5104, SBP: 0.5059, DPPIP: 0.5012'],\n",
232
+ " ['peptide_5',\n",
233
+ " 'KIKSCYYLPCFVTS',\n",
234
+ " 'ADP: 0.5862, BBP: 0.5636, ACP: 0.5271, AHP: 0.5266, AIP: 0.5244, AAP: 0.5149, DPPIP: 0.5111, QSP: 0.5093, APP: 0.5074']]"
235
+ ]
236
+ },
237
+ "execution_count": 27,
238
+ "metadata": {},
239
+ "output_type": "execute_result"
240
+ }
241
+ ],
242
+ "source": [
243
+ "out = MFPs_classifier(file_path,threshold=0.5,top_k=0)\n",
244
+ "out"
245
+ ]
246
+ },
247
+ {
248
+ "cell_type": "code",
249
+ "execution_count": 15,
250
+ "metadata": {},
251
+ "outputs": [
252
+ {
253
+ "data": {
254
+ "text/plain": [
255
+ "tensor(0.4508)"
256
+ ]
257
+ },
258
+ "execution_count": 15,
259
+ "metadata": {},
260
+ "output_type": "execute_result"
261
+ }
262
+ ],
263
+ "source": [
264
+ "x = -0.1974\n",
265
+ "sigmoid(torch.tensor(x))"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "code",
270
+ "execution_count": 23,
271
+ "metadata": {},
272
+ "outputs": [
273
+ {
274
+ "name": "stdout",
275
+ "output_type": "stream",
276
+ "text": [
277
+ "[('ADP', 0.5739), ('BBP', 0.5358), ('AAP', 0.5204), ('AIP', 0.5153), ('DPPIP', 0.5056), ('ABP', 0.5001)] 6\n",
278
+ "{'ADP': 0.5739, 'BBP': 0.5358, 'AAP': 0.5204}\n"
279
+ ]
280
+ }
281
+ ],
282
+ "source": [
283
+ "original_dict = {'ADP': 0.5739, 'BBP': 0.5358, 'AAP': 0.5204, 'AIP': 0.5153, 'DPPIP': 0.5056, 'ABP': 0.5001}\n",
284
+ "n = 3 # 要保留的键值对数量\n",
285
+ "\n",
286
+ "sliced_items = list(original_dict.items())\n",
287
+ "print(sliced_items,len(sliced_items))\n",
288
+ "sliced_dict = dict(sliced_items[:n])\n",
289
+ "print(sliced_dict)"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": null,
295
+ "metadata": {},
296
+ "outputs": [],
297
+ "source": []
298
+ }
299
+ ],
300
+ "metadata": {
301
+ "kernelspec": {
302
+ "display_name": "env3.8",
303
+ "language": "python",
304
+ "name": "python3"
305
+ },
306
+ "language_info": {
307
+ "codemirror_mode": {
308
+ "name": "ipython",
309
+ "version": 3
310
+ },
311
+ "file_extension": ".py",
312
+ "mimetype": "text/x-python",
313
+ "name": "python",
314
+ "nbconvert_exporter": "python",
315
+ "pygments_lexer": "ipython3",
316
+ "version": "3.8.0"
317
+ },
318
+ "orig_nbformat": 4
319
+ },
320
+ "nbformat": 4,
321
+ "nbformat_minor": 2
322
+ }
app.py ADDED
@@ -0,0 +1,72 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from DeepMFPP.predictor import predict
3
+ from DeepMFPP.data_helper import Data2EqlTensor
4
+ import gradio as gr
5
+ from Bio import SeqIO
6
+ device = "cuda" if torch.cuda.is_available() else "cpu"
7
+
8
def MFPs_classifier(file, threshold: float = 0.5, top_k: int = 0):
    """Predict multi-functional peptide (MFP) categories for every sequence in a FASTA file.

    Parameters
    ----------
    file : a gradio ``File`` upload (exposes a ``.name`` temp path) or a plain
        path string (convenient for notebook testing).
    threshold : float, default 0.5
        Sigmoid-probability cutoff; categories scoring above it are reported.
    top_k : int, default 0
        If > 0, return only the top-k categories instead of thresholding.

    Returns
    -------
    The structure produced by ``predict``: per sequence, ``[id, sequence]``
    followed by its ``{category: probability}`` dict.
    """
    # Accept both a gradio File object and a bare path string, so the same
    # function works from the web UI and from a notebook.
    fasta_path = file if isinstance(file, str) else file.name
    data = [(record.id, str(record.seq)) for record in SeqIO.parse(fasta_path, 'fasta')]
    # 51 is the fixed input length the model was trained with
    # (presumably 50 padded residues plus one special token — TODO confirm).
    seqs, _ = Data2EqlTensor(data, 51, AminoAcid_vocab='esm')
    model_weight_path = './weight/DeepMFPP-Best.pth'
    return predict(seqs=seqs, data=data, model_path=model_weight_path,
                   threshold=threshold, top_k=top_k, device=device)
17
+ """
18
+ [['peptide_1', 'ARRRRCSDRFRNCPADEALCGRRRR'],
19
+ {'ADP': 0.5739, 'BBP': 0.5358},
20
+ ['peptide_2', 'FFHHIFRGIVHVGKTIHKLVTGT'],
21
+ {'ADP': 0.5633, 'AIP': 0.5371},
22
+ ['peptide_3', 'GLRKRLRKFRNKIKEKLKKIGQKIQGFVPKLAPRTDY'],
23
+ {'AAP': 0.5418, 'ADP': 0.5346},
24
+ ['peptide_4', 'FLGALWNVAKSVF'],
25
+ {'ADP': 0.5684, 'BBP': 0.5619},
26
+ ['peptide_5', 'KIKSCYYLPCFVTS'],
27
+ {'ADP': 0.5862, 'BBP': 0.5636}]
28
+ """
29
+
30
# Gradio front-end for DeepMFPP: a FASTA upload plus threshold / top-k sliders
# feed MFPs_classifier; predictions land in a three-column dataframe.
with gr.Blocks() as demo:
    gr.Markdown(" ## DeepMFPP")
    gr.Markdown("In this study, we developed a multi-functional peptides(MFPs) prediction model. The model was used to predict the \
functional labels (21 categories of MFPs involved in our study) that peptide sequences have.")

    with gr.Tab("MFPs Prediction Model"):
        with gr.Row():
            with gr.Column(scale=2):
                input_fasta = gr.File()
            with gr.Column(scale=2):
                cutoff = gr.Slider(0, 1, step=0.01, value=0.5, interactive=True, label="Threshold")
                top_k = gr.Slider(0, 21, step=1, value=0, interactive=True, label="top_k")

        gr.Markdown("### Note")
        gr.Markdown("- Limit the number of input sequences to less than 128.")
        gr.Markdown("- If top_k is set to 0, the combination of all probability values (processed by the sigmoid function) \
larger than the threshold is returned by default. Otherwise, the specified top k predictions are returned")
        gr.Markdown("- The file should be the Fasta format.")
        gr.Markdown("- If the length of the sequence is less than 50, use 0 to fill to 50; if the length is over 50, \
We used only the first 25 amino acids of each N-terminal and C-terminal of the sequence for prediction.")
        # NOTE(review): renamed from image_button_MFPs — it submits a FASTA file, not an image.
        submit_button_MFPs = gr.Button("Submit")
        with gr.Column():
            gr.Markdown("Note: The predicted probabilities are processed by sigmoid function")
            MFPs_output = gr.DataFrame(
                headers=["Sequence Id", "Sequence", "Categories and probabilities"],
                datatype=["str", "str", "str"],
            )

        submit_button_MFPs.click(MFPs_classifier, inputs=[input_fasta, cutoff, top_k], outputs=MFPs_output)

    with gr.Accordion("Citation"):
        gr.Markdown("- GitHub: https://github.com/leonern/DeepMFPP-GitHub")

    with gr.Accordion("License"):
        gr.Markdown("- Released under the [MIT license](https://github.com/leonern/DeepMFPP-GitHub/blob/main/LICENSE). ")

    with gr.Accordion("Contact"):
        gr.Markdown("- If you have any questions, please file a Github issue or contact me at 107552103310@stu.xju.edu.cn")


demo.queue(4)  # cap request concurrency at 4
demo.launch()  # pass share=True for a public link
requirements.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ torch
2
+ numpy
3
+ biopython
4
+ transformers
5
+ gradio
6
+ # 'Bio' removed: biopython (listed above) already provides the Bio package; the PyPI project named 'Bio' is a different distribution
7
+ fair-esm
test_samples.fa ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ >peptide_1
2
+ ARRRRCSDRFRNCPADEALCGRRRR
3
+ >peptide_2
4
+ FFHHIFRGIVHVGKTIHKLVTGT
5
+ >peptide_3
6
+ GLRKRLRKFRNKIKEKLKKIGQKIQGFVPKLAPRTDY
7
+ >peptide_4
8
+ FLGALWNVAKSVF
9
+ >peptide_5
10
+ KIKSCYYLPCFVTS
weight/DeepMFPP-Best.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:39768e826608e7d2c1fc3eacb1d740c5c60fc55ec6fc29fa3ef080d7d3ce3b46
3
+ size 706172953