| | |
| | """ |
| | Created on Sat Jul 31 21:54:08 2021 |
| | |
| | @author: Osama |
| | """ |
| |
|
| | from torch.utils.data import Dataset |
| | from Bio.PDB import Polypeptide |
| | import numpy as np |
| | import torch |
| | import pandas as pd |
| | import os |
| | |
| | import ast |
| | import pdb |
| |
|
| | |
class InterpepComplexes(Dataset):
    """Dataset over the encoded Interpep peptide-protein examples.

    Example identifiers are read from ``train_examples.npy``,
    ``val_examples.npy`` and ``test_examples.npy``; the per-example arrays
    are read from ``.npz`` files below ``encoded_data_directory``.

    Indexing returns ``(pep_sequence, prot_sequence, target)``:

    * ``pep_sequence`` -- tensor holding the argmax over the last axis of the
      stored ``target_sequence`` array.
    * ``prot_sequence`` -- space-separated one-letter residue string decoded
      from the first 20 columns of the stored ``nodes`` array.
    * ``target`` -- int64 tensor with one entry per row of ``nodes``; entries
      listed in the stored ``binding_sites`` array are 1, all others 0.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/interpep_data/",
                 datasets_root="../../datasets/"):
        """Load the example lists and select the active split.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories.
            datasets_root: Directory containing ``interpep_data``, where the
                ``*_examples.npy`` lists live. The default reproduces the
                previously hard-coded locations.

        Raises:
            ValueError: If ``mode`` is not a known split.
        """
        # Fail fast: an unknown mode used to surface only later as an
        # AttributeError / UnboundLocalError in __len__ / __getitem__.
        if mode not in self._SPLITS:
            raise ValueError(
                f"mode must be one of {self._SPLITS}, got {mode!r}")

        self.mode = mode
        self.encoded_data_directory = encoded_data_directory

        split_dir = os.path.join(datasets_root, "interpep_data")
        self.train_dir = os.path.join(split_dir, "train_examples.npy")
        self.test_dir = os.path.join(split_dir, "test_examples.npy")
        self.val_dir = os.path.join(split_dir, "val_examples.npy")

        # All three lists stay loaded so the public attributes remain
        # available regardless of the active split.
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)

        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example list that belongs to the current ``self.mode``."""
        if self.mode == "train":
            return self.train_list
        if self.mode == "val":
            return self.val_list
        if self.mode == "test":
            return self.test_list
        raise ValueError(
            f"mode must be one of {self._SPLITS}, got {self.mode!r}")

    def __getitem__(self, index):
        """Return ``(pep_sequence, prot_sequence, target)`` for one example."""
        item = self._examples()[index]
        file_dir = self.encoded_data_directory

        fragment_path = os.path.join(file_dir, "fragment_data", item + ".npz")
        with np.load(fragment_path) as data:
            encoded_peptide = data["target_sequence"]
            binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated
        # fields of the example identifier.
        fields = item.split("_")
        receptor_path = os.path.join(
            file_dir, "receptor_data", fields[0] + "_" + fields[1] + ".npz")
        with np.load(receptor_path) as data:
            receptor_nodes = data["nodes"]

        # Per-residue binary label: 1 at every listed binding site.
        binding = np.zeros(len(receptor_nodes), dtype=np.int64)
        if len(binding_sites) != 0:
            binding[binding_sites] = 1
        target = torch.from_numpy(binding)

        # Presumably the first 20 node features are a one-hot amino-acid
        # encoding -- TODO confirm against the encoding script.
        residue_indices = np.argmax(receptor_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(encoded_peptide), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the split chosen at construction."""
        return self.num_data
| |
|
class PPI(Dataset):
    """Protein-protein interaction dataset read from per-split CSV files.

    Each CSV row must provide the columns ``Chain_1_motifs``,
    ``Chain_2_motifs``, ``Sequence1`` and ``Sequence2``. The motif columns
    hold Python list literals of strings whose second ``"_"``-separated field
    is an integer position.

    Indexing returns ``(pep_sequence, prot_sequence, target)`` where
    ``target`` is a float32 tensor with one entry per character of
    ``prot_sequence``: 1.0 at the motif positions, 0.0 elsewhere.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, mode, csv_dir_path="/home/u21307130002/PepNN/pepnn/datasets/ppi/"):
        """Read the split tables and record the size of the active split.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            csv_dir_path: Directory containing ``train.csv`` and ``val.csv``
                (and ``test.csv`` when ``mode == "test"``).

        Raises:
            ValueError: If ``mode`` is not a known split.
        """
        if mode not in self._SPLITS:
            raise ValueError(
                f"mode must be one of {self._SPLITS}, got {mode!r}")

        self.mode = mode
        self.train_data = pd.read_csv(os.path.join(csv_dir_path, 'train.csv'))
        self.val_data = pd.read_csv(os.path.join(csv_dir_path, 'val.csv'))

        if mode == "test":
            # The test table is only read on demand so that train/val use
            # keeps working without it. The file name follows the naming of
            # the other splits -- assumption, confirm against the data layout.
            self.test_data = pd.read_csv(os.path.join(csv_dir_path, 'test.csv'))

        # Previously only set for "train", which broke len() on other splits.
        self.num_data = len(self._split_frame())

    def _split_frame(self):
        """Return the DataFrame that belongs to the current ``self.mode``."""
        if self.mode == "train":
            return self.train_data
        if self.mode == "val":
            return self.val_data
        if self.mode == "test":
            return self.test_data
        raise ValueError(
            f"mode must be one of {self._SPLITS}, got {self.mode!r}")

    def __len__(self):
        """Return the number of rows in the split chosen at construction."""
        return self.num_data

    def __getitem__(self, index):
        """Return ``(pep_sequence, prot_sequence, target)`` for one row.

        Raises:
            IndexError: If a motif position lies beyond the end of the chosen
                protein sequence.
        """
        if torch.is_tensor(index):
            index = index.tolist()

        item = self._split_frame().iloc[index]

        motif1 = ast.literal_eval(item['Chain_1_motifs'])
        motif2 = ast.literal_eval(item['Chain_2_motifs'])

        # NOTE(review): this compares the string length of the FIRST motif of
        # each chain, not the number of motifs or the sequence lengths.
        # Kept as-is; confirm that this is the intended heuristic.
        if len(motif1[0]) > len(motif2[0]):
            motifs = motif1
            prot_sequence = item['Sequence1']
            pep_sequence = item['Sequence2']
        else:
            motifs = motif2
            pep_sequence = item['Sequence1']
            prot_sequence = item['Sequence2']

        positions = [int(motif.split('_')[1]) for motif in motifs]

        # Report bad rows with a real error instead of dropping into a
        # debugger, which stalls non-interactive runs and loader workers.
        if positions and max(positions) >= len(prot_sequence):
            raise IndexError(
                f"motif position {max(positions)} is out of range for a "
                f"protein sequence of length {len(prot_sequence)} "
                f"(row {index!r}, mode {self.mode!r})")

        target = torch.zeros(len(prot_sequence), dtype=torch.float32)
        if positions:
            target[positions] = 1.0

        return pep_sequence, prot_sequence, target
| |
|
| |
|
| |
|
| |
|
class PepBindComplexes(Dataset):
    """Dataset over the encoded PepBind peptide-protein examples.

    Example identifiers are read from ``train_examples.npy``,
    ``val_examples.npy`` and ``test_examples.npy``; the per-example arrays
    are read from ``.npz`` files below ``encoded_data_directory``.

    Indexing returns ``(pep_sequence, prot_sequence, target)``:

    * ``pep_sequence`` -- tensor holding the argmax over the last axis of the
      stored ``target_sequence`` array.
    * ``prot_sequence`` -- space-separated one-letter residue string decoded
      from the first 20 columns of the stored ``nodes`` array.
    * ``target`` -- int64 tensor with one entry per row of ``nodes``; entries
      listed in the stored ``binding_sites`` array are 1, all others 0.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepbind_data/",
                 datasets_root="../../datasets/"):
        """Load the example lists and select the active split.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories.
            datasets_root: Directory containing ``pepbind_data``, where the
                ``*_examples.npy`` lists live. The default reproduces the
                previously hard-coded locations.

        Raises:
            ValueError: If ``mode`` is not a known split.
        """
        # Fail fast: an unknown mode used to surface only later as an
        # AttributeError / UnboundLocalError in __len__ / __getitem__.
        if mode not in self._SPLITS:
            raise ValueError(
                f"mode must be one of {self._SPLITS}, got {mode!r}")

        self.mode = mode
        self.encoded_data_directory = encoded_data_directory

        split_dir = os.path.join(datasets_root, "pepbind_data")
        self.train_dir = os.path.join(split_dir, "train_examples.npy")
        self.test_dir = os.path.join(split_dir, "test_examples.npy")
        self.val_dir = os.path.join(split_dir, "val_examples.npy")

        # All three lists stay loaded so the public attributes remain
        # available regardless of the active split.
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)

        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example list that belongs to the current ``self.mode``."""
        if self.mode == "train":
            return self.train_list
        if self.mode == "val":
            return self.val_list
        if self.mode == "test":
            return self.test_list
        raise ValueError(
            f"mode must be one of {self._SPLITS}, got {self.mode!r}")

    def __getitem__(self, index):
        """Return ``(pep_sequence, prot_sequence, target)`` for one example."""
        item = self._examples()[index]
        file_dir = self.encoded_data_directory

        fragment_path = os.path.join(file_dir, "fragment_data", item + ".npz")
        with np.load(fragment_path) as data:
            encoded_peptide = data["target_sequence"]
            binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated
        # fields of the example identifier.
        fields = item.split("_")
        receptor_path = os.path.join(
            file_dir, "receptor_data", fields[0] + "_" + fields[1] + ".npz")
        with np.load(receptor_path) as data:
            receptor_nodes = data["nodes"]

        # Per-residue binary label: 1 at every listed binding site.
        binding = np.zeros(len(receptor_nodes), dtype=np.int64)
        if len(binding_sites) != 0:
            binding[binding_sites] = 1
        target = torch.from_numpy(binding)

        # Presumably the first 20 node features are a one-hot amino-acid
        # encoding -- TODO confirm against the encoding script.
        residue_indices = np.argmax(receptor_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(encoded_peptide), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the split chosen at construction."""
        return self.num_data
| | |
class PeptideComplexes(Dataset):
    """Dataset over the encoded PepNN peptide-protein examples with weights.

    Train/val identifiers and ``example_weights.npy`` are read from the
    ``pepnn_data`` directory; test identifiers and test arrays are read from
    the ``pepnn_test_data`` directory.

    Indexing returns ``(pep_sequence, prot_sequence, target, weight)``:

    * ``pep_sequence`` -- tensor holding the argmax over the last axis of the
      stored ``target_sequence`` array.
    * ``prot_sequence`` -- space-separated one-letter residue string decoded
      from the first 20 columns of the stored ``nodes`` array.
    * ``target`` -- int64 tensor with one entry per row of ``nodes``; entries
      listed in the stored ``binding_sites`` array are 1, all others 0.
    * ``weight`` -- ``example_weights[item]`` for train/val, ``1`` for test.
    """

    _SPLITS = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepnn_data/all_data/",
                 datasets_root="../../datasets/"):
        """Load the example lists and weights and select the active split.

        Args:
            mode: One of ``"train"``, ``"val"`` or ``"test"``.
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories used
                for the train and val splits.
            datasets_root: Directory containing ``pepnn_data`` and
                ``pepnn_test_data``. The default reproduces the previously
                hard-coded locations.

        Raises:
            ValueError: If ``mode`` is not a known split.
        """
        # Fail fast: an unknown mode used to surface only later as an
        # AttributeError / UnboundLocalError in __len__ / __getitem__.
        if mode not in self._SPLITS:
            raise ValueError(
                f"mode must be one of {self._SPLITS}, got {mode!r}")

        self.mode = mode
        self.encoded_data_directory = encoded_data_directory
        self.datasets_root = datasets_root

        train_val_dir = os.path.join(datasets_root, "pepnn_data")
        self.train_dir = os.path.join(train_val_dir, "train_examples.npy")
        self.test_dir = os.path.join(
            datasets_root, "pepnn_test_data", "test_examples.npy")
        self.val_dir = os.path.join(train_val_dir, "val_examples.npy")

        self.example_weights = np.load(
            os.path.join(train_val_dir, "example_weights.npy"))

        # All three lists stay loaded so the public attributes remain
        # available regardless of the active split.
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)

        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example list that belongs to the current ``self.mode``."""
        if self.mode == "train":
            return self.train_list
        if self.mode == "val":
            return self.val_list
        if self.mode == "test":
            return self.test_list
        raise ValueError(
            f"mode must be one of {self._SPLITS}, got {self.mode!r}")

    def __getitem__(self, index):
        """Return ``(pep_sequence, prot_sequence, target, weight)`` for one example."""
        item = self._examples()[index]

        if self.mode == "test":
            # Test examples carry no stored weight and live in their own tree.
            weight = 1
            file_dir = os.path.join(
                self.datasets_root, "pepnn_test_data", "all_data")
        else:
            # NOTE(review): ``item`` is also used below as a file-name string;
            # confirm that ``example_weights`` can be indexed by it.
            weight = self.example_weights[item]
            file_dir = self.encoded_data_directory

        fragment_path = os.path.join(file_dir, "fragment_data", item + ".npz")
        with np.load(fragment_path) as data:
            encoded_peptide = data["target_sequence"]
            binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated
        # fields of the example identifier.
        fields = item.split("_")
        receptor_path = os.path.join(
            file_dir, "receptor_data", fields[0] + "_" + fields[1] + ".npz")
        with np.load(receptor_path) as data:
            receptor_nodes = data["nodes"]

        # Per-residue binary label: 1 at every listed binding site.
        binding = np.zeros(len(receptor_nodes), dtype=np.int64)
        if len(binding_sites) != 0:
            binding[binding_sites] = 1
        target = torch.from_numpy(binding)

        # Presumably the first 20 node features are a one-hot amino-acid
        # encoding -- TODO confirm against the encoding script.
        residue_indices = np.argmax(receptor_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(encoded_peptide), dim=-1)

        return pep_sequence, prot_sequence, target, weight

    def __len__(self):
        """Return the number of examples in the split chosen at construction."""
        return self.num_data
| | |
| | |
class BitenetComplexes(Dataset):
    """Dataset over the encoded BiteNet peptide-protein examples.

    Example identifiers are read from ``bitenet_data/examples.npy``; the
    per-example arrays are read from ``.npz`` files below
    ``encoded_data_directory``.

    Indexing returns ``(pep_sequence, prot_sequence, target)``:

    * ``pep_sequence`` -- tensor holding the argmax over the last axis of the
      stored ``target_sequence`` array.
    * ``prot_sequence`` -- space-separated one-letter residue string decoded
      from the first 20 columns of the stored ``nodes`` array.
    * ``target`` -- int64 tensor that is 1 wherever the stored
      ``binding_matrix``, summed over its first axis, is at least 1.
    """

    def __init__(self, encoded_data_directory="../bitenet_data/all_data/",
                 datasets_root="../../datasets/"):
        """Load the list of example identifiers.

        Args:
            encoded_data_directory: Directory containing the
                ``fragment_data`` and ``receptor_data`` sub-directories.
                NOTE(review): the default points to a different tree than the
                example list below -- confirm that this is intended.
            datasets_root: Directory containing ``bitenet_data``, where
                ``examples.npy`` lives. The default reproduces the previously
                hard-coded location.
        """
        self.encoded_data_directory = encoded_data_directory

        self.train_dir = os.path.join(
            datasets_root, "bitenet_data", "examples.npy")
        self.full_list = np.load(self.train_dir)

        self.num_data = len(self.full_list)

    def __getitem__(self, index):
        """Return ``(pep_sequence, prot_sequence, target)`` for one example."""
        item = self.full_list[index]
        file_dir = self.encoded_data_directory

        # The fragment file name is the identifier with "_" inserted before
        # its final character.
        fragment_path = os.path.join(
            file_dir, "fragment_data", item[:-1] + "_" + item[-1] + ".npz")
        with np.load(fragment_path) as data:
            encoded_peptide = data["target_sequence"]
            binding_matrix = data["binding_matrix"]

        # The receptor file name is the first "_"-separated field plus the
        # first character of the second field.
        fields = item.split("_")
        receptor_path = os.path.join(
            file_dir, "receptor_data", fields[0] + "_" + fields[1][0] + ".npz")
        with np.load(receptor_path) as data:
            receptor_nodes = data["nodes"]

        # Collapse the first axis of the binding matrix and binarise it.
        binding_sum = np.sum(binding_matrix, axis=0).T
        target = torch.LongTensor(binding_sum >= 1)

        # Presumably the first 20 node features are a one-hot amino-acid
        # encoding -- TODO confirm against the encoding script.
        residue_indices = np.argmax(receptor_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(encoded_peptide), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples listed in ``examples.npy``."""
        return self.num_data