fondress
/

PDeepPP_ACE

@@ -47,50 +47,74 @@ Here is an example of how to use PDeepPP to process protein sequences and obtain
 ```python
 import torch
-import numpy as np
 from transformers import AutoModel
-from processing_pdeeppp import PDeepPPProcessor
-# Load the pretrained feature representations
-train_representations_path = "./pretrained_weights/Hydroxyproline_P/train_combined_representations.npy"  # Replace with your own path
-test_representations_path = "./pretrained_weights/Hydroxyproline_P/test_combined_representations.npy"  # Replace with your own path
-# Check that both files exist (NOTE(review): `os` is used below but is never imported in this snippet)
-assert os.path.exists(train_representations_path), "预训练的 train_combined_representations.npy 文件不存在！"
-assert os.path.exists(test_representations_path), "预训练的 test_combined_representations.npy 文件不存在！"
-# Load the pretrained features
-train_representations = np.load(train_representations_path)
-test_representations = np.load(test_representations_path)
-# Convert them to PyTorch tensors
-train_representations_tensor = torch.tensor(train_representations)
-test_representations_tensor = torch.tensor(test_representations)
-# Load the `PDeepPP` model
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print(f"Using {device} device")
-model = AutoModel.from_pretrained("fondress/PDeepPP_ACE", trust_remote_code=True)
-model.to(device)
-# Example protein sequences
-protein_sequences = ["MKVSTYSTQ", "MSRSTYV"]
-# Initialize the PDeepPPProcessor
-processor = PDeepPPProcessor(pad_char="X", target_length=33)
-# Preprocess the sequences
-inputs = processor(sequences=protein_sequences, ptm_mode=True, return_tensors="pt")  # Set ptm_mode=True to process PTM data
-# Replace the model's input embeddings with the pretrained features
-# Assume inputs["input_embeds"] is the embedding that needs to be replaced
-# The pretrained test-set features are used here as an example
-inputs["input_embeds"] = test_representations_tensor[:len(protein_sequences)].to(device)
-# Run the prediction
 model.eval()
-outputs = model(**inputs)
-print(outputs["logits"])
 ```
 ## Training and customization

 ```python
 import torch
+import esm
+from DataProcessor_pdeeppp import PDeepPPProcessor
+from Pretraining_pdeeppp import PretrainingPDeepPP
 from transformers import AutoModel
+# Global parameter settings
+device = torch.device("cpu")
+pad_char = "X"  # Padding character
+target_length = 33  # Target length for sequence padding
+mode = "BPS"  # Mode setting (only configured in example.py)
+esm_ratio = 1  # Ratio for ESM embeddings
+# Initialize the PDeepPPProcessor
+processor = PDeepPPProcessor(pad_char=pad_char, target_length=target_length)
+# Example protein sequences (test sequences)
+protein_sequences = ["VELYP", "YPLDL", "ESHINQKWVCK"]
+# Preprocess the sequences
+inputs = processor(sequences=protein_sequences, mode=mode, return_tensors="pt")  # Dynamic mode parameter
+processed_sequences = inputs["raw_sequences"]
+# Load the ESM model
+esm_model, esm_alphabet = esm.pretrained.esm2_t33_650M_UR50D()
+esm_model = esm_model.to(device)
+esm_model.eval()
+# Initialize the PretrainingPDeepPP module
+pretrainer = PretrainingPDeepPP(
+    embedding_dim=1280,
+    target_length=target_length,
+    esm_ratio=esm_ratio,
+    device=device
+)
+# Extract the vocabulary and ensure the padding character 'X' is included
+vocab = set("".join(protein_sequences))
+vocab.add(pad_char)  # Add the padding character
+# Generate pretrained features using the PretrainingPDeepPP module
+pretrained_features = pretrainer.create_embeddings(
+    processed_sequences, vocab, esm_model, esm_alphabet
+)
+# Ensure pretrained features are on the same device
+inputs["input_embeds"] = pretrained_features.to(device)
+# Load the PDeepPP model
+model_name = "fondress/PDeepPP_ACE"
+model = AutoModel.from_pretrained(model_name, trust_remote_code=True)  # Directly load the model
+# Perform prediction
 model.eval()
+outputs = model(input_embeds=inputs["input_embeds"])  # Use pretrained features as model input
+logits = outputs["logits"]
+# Compute probability distributions and generate predictions
+softmax = torch.nn.Softmax(dim=-1)  # Apply softmax on the last dimension
+probabilities = softmax(logits)
+predicted_labels = (probabilities >= 0.5).long()
+# Print the prediction results for each sequence
+print("\nPrediction Results:")
+for i, seq in enumerate(processed_sequences):
+    print(f"Sequence: {seq}")
+    print(f"Probability: {probabilities[i].item():.4f}")
+    print(f"Predicted Label: {predicted_labels[i].item()}")
+    print("-" * 50)
 ```
 ## Training and customization