fondress committed on
Commit
db04b68
·
verified ·
1 Parent(s): 5522f60

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +65 -41
README.md CHANGED
@@ -47,50 +47,74 @@ Here is an example of how to use PDeepPP to process protein sequences and obtain
47
 
48
  ```python
49
  import torch
50
- import numpy as np
 
 
51
  from transformers import AutoModel
52
- from processing_pdeeppp import PDeepPPProcessor
53
 
54
- # 加载预训练的特征表示
55
- train_representations_path = "./pretrained_weights/Hydroxyproline_P/train_combined_representations.npy" # 替换为你的路径
56
- test_representations_path = "./pretrained_weights/Hydroxyproline_P/test_combined_representations.npy" # 替换为你的路径
57
-
58
- # 检查文件是否存在
59
- assert os.path.exists(train_representations_path), "预训练的 train_combined_representations.npy 文件不存在!"
60
- assert os.path.exists(test_representations_path), "预训练的 test_combined_representations.npy 文件不存在!"
61
-
62
- # 加载预训练特征
63
- train_representations = np.load(train_representations_path)
64
- test_representations = np.load(test_representations_path)
65
-
66
- # 转换为 PyTorch 张量
67
- train_representations_tensor = torch.tensor(train_representations)
68
- test_representations_tensor = torch.tensor(test_representations)
69
-
70
- # 加载 `PDeepPP` 模型
71
- device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
72
- print(f"Using {device} device")
73
- model = AutoModel.from_pretrained("fondress/PDeepPP_ACE", trust_remote_code=True)
74
- model.to(device)
75
-
76
- # Example protein sequences
77
- protein_sequences = ["MKVSTYSTQ", "MSRSTYV"]
78
-
79
- # 初始化 PDeepPPProcessor
80
- processor = PDeepPPProcessor(pad_char="X", target_length=33)
81
-
82
- # 预处理序列
83
- inputs = processor(sequences=protein_sequences, ptm_mode=True, return_tensors="pt") # 设置 ptm_mode=True 处理 PTM 数据
84
-
85
- # 替换模型输入的嵌入表示为预训练特征
86
- # 假设 inputs["input_embeds"] 是需要被替换的嵌入
87
- # 在此处选择测试集中的预训练特征作为示例
88
- inputs["input_embeds"] = test_representations_tensor[:len(protein_sequences)].to(device)
89
-
90
- # 进行预测
 
 
 
 
 
 
 
 
 
 
91
  model.eval()
92
- outputs = model(**inputs)
93
- print(outputs["logits"])
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  ```
95
 
96
  ## Training and customization
 
47
 
48
  ```python
49
  import torch
50
+ import esm
51
+ from DataProcessor_pdeeppp import PDeepPPProcessor
52
+ from Pretraining_pdeeppp import PretrainingPDeepPP
53
  from transformers import AutoModel
 
54
 
55
+ # Global parameter settings
56
+ device = torch.device("cpu")
57
+ pad_char = "X" # Padding character
58
+ target_length = 33 # Target length for sequence padding
59
+ mode = "BPS" # Mode setting (only configured in example.py)
60
+ esm_ratio = 1 # Ratio for ESM embeddings
61
+
62
+ # Initialize the PDeepPPProcessor
63
+ processor = PDeepPPProcessor(pad_char=pad_char, target_length=target_length)
64
+
65
+ # Example protein sequences (test sequences)
66
+ protein_sequences = ["VELYP", "YPLDL", "ESHINQKWVCK"]
67
+
68
+ # Preprocess the sequences
69
+ inputs = processor(sequences=protein_sequences, mode=mode, return_tensors="pt") # Dynamic mode parameter
70
+ processed_sequences = inputs["raw_sequences"]
71
+
72
+ # Load the ESM model
73
+ esm_model, esm_alphabet = esm.pretrained.esm2_t33_650M_UR50D()
74
+ esm_model = esm_model.to(device)
75
+ esm_model.eval()
76
+
77
+ # Initialize the PretrainingPDeepPP module
78
+ pretrainer = PretrainingPDeepPP(
79
+ embedding_dim=1280,
80
+ target_length=target_length,
81
+ esm_ratio=esm_ratio,
82
+ device=device
83
+ )
84
+
85
+ # Extract the vocabulary and ensure the padding character 'X' is included
86
+ vocab = set("".join(protein_sequences))
87
+ vocab.add(pad_char) # Add the padding character
88
+
89
+ # Generate pretrained features using the PretrainingPDeepPP module
90
+ pretrained_features = pretrainer.create_embeddings(
91
+ processed_sequences, vocab, esm_model, esm_alphabet
92
+ )
93
+
94
+ # Ensure pretrained features are on the same device
95
+ inputs["input_embeds"] = pretrained_features.to(device)
96
+
97
+ # Load the PDeepPP model
98
+ model_name = "fondress/PDeepPP_ACE"
99
+ model = AutoModel.from_pretrained(model_name, trust_remote_code=True) # Directly load the model
100
+
101
+ # Perform prediction
102
  model.eval()
103
+ outputs = model(input_embeds=inputs["input_embeds"]) # Use pretrained features as model input
104
+ logits = outputs["logits"]
105
+
106
+ # Compute probability distributions and generate predictions
107
+ softmax = torch.nn.Softmax(dim=-1) # Apply softmax on the last dimension
108
+ probabilities = softmax(logits)
109
+ predicted_labels = (probabilities >= 0.5).long()
110
+
111
+ # Print the prediction results for each sequence
112
+ print("\nPrediction Results:")
113
+ for i, seq in enumerate(processed_sequences):
114
+ print(f"Sequence: {seq}")
115
+ print(f"Probability: {probabilities[i].item():.4f}")
116
+ print(f"Predicted Label: {predicted_labels[i].item()}")
117
+ print("-" * 50)
118
  ```
119
 
120
  ## Training and customization