fondress committed (verified)
Commit a959f5c · 1 Parent(s): f161845

Create processing_pdeeppp.py

Files changed (1):
  1. processing_pdeeppp.py +128 -0
processing_pdeeppp.py ADDED
@@ -0,0 +1,128 @@
+ import os
+ import pandas as pd
+ import torch
+ import torch.nn as nn
+ import numpy as np
+ from processing_pdeeppp import PDeepPPProcessor
+ from sklearn.model_selection import train_test_split
+ import esm
+
+ # Select the compute device
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ print(f"Using device: {device}")
+
+ # Hyperparameters
+ batch_size = 16
+ embedding_dim = 1280
+ esm_ratio = 0.95
+ target_length = 33  # target sequence length for the PDeepPPProcessor
+ ptm_type = "Hydroxyproline_P"
+ save_dir = f"./pretrained_weights/{ptm_type}/"
+ os.makedirs(save_dir, exist_ok=True)
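+ # esm_ratio weights the ESM features against the learned embedding features
+ # when the two representations are combined at the end of this script.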
+
+ # Load the dataset
+ data_path = "/path/to/your/dataset.xlsx"  # replace with the path to your dataset
+ data = pd.read_excel(data_path)
+ labels = data["label"].values
+ sequences = data["sequence"].fillna("").values
+
+ # Train/test split
+ train_sequences, test_sequences, train_labels, test_labels = train_test_split(
+     sequences, labels, test_size=0.2, random_state=42
+ )
+
+ # Initialize the PDeepPPProcessor
+ processor = PDeepPPProcessor(pad_char="X", target_length=target_length)
+
+ # Process the training and test data
+ train_inputs = processor(sequences=train_sequences, ptm_mode=True)
+ test_inputs = processor(sequences=test_sequences, ptm_mode=True)
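+ # (train_inputs / test_inputs hold the processor's padded, encoded output; the
+ # ESM pipeline below works from the raw sequences directly.)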
+
+ # Load the pretrained ESM-2 model
+ esm_model, esm_alphabet = esm.pretrained.esm2_t33_650M_UR50D()
+ batch_converter = esm_alphabet.get_batch_converter()
+ esm_model = esm_model.to(device)
+ esm_model.eval()
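+
+ # esm2_t33_650M_UR50D has 33 transformer layers and 1280-dimensional hidden
+ # states; this is why the script sets embedding_dim = 1280 and extracts
+ # representations from the final layer (repr_layers=[33]).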
+
+ def extract_esm_representations(sequences, batch_size=16):
+     """Extract per-residue sequence representations from the ESM model."""
+     sequence_representations = []
+     for i in range(0, len(sequences), batch_size):
+         batch_data = sequences[i : i + batch_size]
+         batch_labels = [0] * len(batch_data)  # placeholder labels for the batch converter
+         batch = list(zip(batch_labels, batch_data))
+         _, _, batch_tokens = batch_converter(batch)
+         batch_tokens = batch_tokens.to(device)
+         with torch.no_grad():
+             results = esm_model(batch_tokens, repr_layers=[33])
+         token_representations = results["representations"][33]
+         for seq, token_repr in zip(batch_data, token_representations):
+             seq_len = min(len(seq), target_length)  # truncate so every output stacks to target_length
+             seq_repr = token_repr[1 : seq_len + 1]  # skip the BOS token; keep residue positions only
+             if seq_len < target_length:
+                 padding = torch.zeros(target_length - seq_len, embedding_dim).to(device)
+                 seq_repr = torch.cat((seq_repr, padding), dim=0)
+             sequence_representations.append(seq_repr)
+     return torch.stack(sequence_representations)
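+
+ # The ESM batch converter prepends a BOS token (and appends an EOS token) to
+ # every sequence, so token_repr[1 : seq_len + 1] selects exactly the
+ # per-residue embeddings.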
+
+ # Extract the ESM representations
+ print("Extracting ESM representations for training data...")
+ train_esm_representations = extract_esm_representations(train_sequences, batch_size=batch_size)
+ print("Extracting ESM representations for testing data...")
+ test_esm_representations = extract_esm_representations(test_sequences, batch_size=batch_size)
+
+ # Define the embedding model
+ class EmbeddingPretrainedModel(nn.Module):
+     def __init__(self, vocab_size, embedding_dim, max_len):
+         super(EmbeddingPretrainedModel, self).__init__()
+         self.embedding = nn.Embedding(vocab_size, embedding_dim)
+         self.fc = nn.Linear(embedding_dim, embedding_dim)
+
+     def forward(self, x):
+         x = self.embedding(x)
+         x = self.fc(x)
+         return x
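+
+ # Note: this model is used below with its randomly initialized weights (there
+ # is no training step), so it contributes a fixed random component, weighted
+ # by (1 - esm_ratio), to the combined representations.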
+
+ # Build the vocabulary (sorted so the char-to-index mapping is reproducible)
+ vocab = sorted(set("".join(sequences)))
+ vocab_size = len(vocab)
+ vocab_dict = {char: i for i, char in enumerate(vocab)}
+
+ def seq_to_indices(seq, vocab_dict):
+     """Convert a sequence into a list of vocabulary indices."""
+     return [vocab_dict[char] for char in seq]
+
+ train_indices = [seq_to_indices(seq, vocab_dict) for seq in train_sequences]
+ test_indices = [seq_to_indices(seq, vocab_dict) for seq in test_sequences]
+
+ def pad_sequences(sequences, max_len=None, pad_value=0):
+     """Pad (and truncate) index sequences to a common length."""
+     if max_len is None:
+         max_len = max(len(seq) for seq in sequences)
+     padded_sequences = torch.full((len(sequences), max_len), pad_value, dtype=torch.long)
+     for i, seq in enumerate(sequences):
+         seq = seq[:max_len]  # truncate sequences longer than max_len
+         padded_sequences[i, :len(seq)] = torch.tensor(seq)
+     return padded_sequences
+
+ # Pad the index sequences to target_length
+ train_indices_padded = pad_sequences(train_indices, max_len=target_length)
+ test_indices_padded = pad_sequences(test_indices, max_len=target_length)
+
+ # Initialize the embedding model
+ embedding_model = EmbeddingPretrainedModel(vocab_size, embedding_dim, target_length).to(device)
+
+ # Compute the embedding representations
+ with torch.no_grad():
+     train_embedding_output = embedding_model(train_indices_padded.to(device))
+     test_embedding_output = embedding_model(test_indices_padded.to(device))
+
+ # Combine the ESM and embedding representations as a weighted sum
+ train_combined_representations = esm_ratio * train_esm_representations + (1 - esm_ratio) * train_embedding_output
+ test_combined_representations = esm_ratio * test_esm_representations + (1 - esm_ratio) * test_embedding_output
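+ # Both operands have shape (num_sequences, target_length, embedding_dim), so
+ # this is an elementwise blend: 95% ESM features, 5% embedding features.
+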
+ # Save everything as .npy files
+ np.save(os.path.join(save_dir, "train_combined_representations.npy"), train_combined_representations.cpu().numpy())
+ np.save(os.path.join(save_dir, "test_combined_representations.npy"), test_combined_representations.cpu().numpy())
+ np.save(os.path.join(save_dir, "train_labels.npy"), train_labels)
+ np.save(os.path.join(save_dir, "test_labels.npy"), test_labels)
+
+ print(f"Preprocessed data and representations saved to {save_dir}")