| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat Jul 31 21:54:08 2021 | |
| @author: Osama | |
| """ | |
| from torch.utils.data import Dataset | |
| from Bio.PDB import Polypeptide | |
| import numpy as np | |
| import torch | |
| import pandas as pd | |
| import os | |
| # import esm | |
| import ast | |
| import pdb | |
class InterpepComplexes(Dataset):
    """Peptide/receptor examples read from the ``interpep_data`` directory.

    Every item is a ``(pep_sequence, prot_sequence, target)`` triple:

    * ``pep_sequence`` -- ``torch.argmax`` over the last axis of the stored
      ``target_sequence`` array (presumably a one-hot peptide encoding --
      TODO confirm against the preprocessing script).
    * ``prot_sequence`` -- space-separated one-letter residue codes decoded
      from the first 20 columns of the receptor ``nodes`` array.
    * ``target`` -- ``LongTensor`` with one entry per receptor node, set to 1
      at the stored ``binding_sites`` indices and 0 elsewhere.

    Args:
        mode: Split to serve: ``"train"``, ``"val"`` or ``"test"``.
        encoded_data_directory: Directory holding the ``fragment_data/`` and
            ``receptor_data/`` ``.npz`` files. It is concatenated directly
            with those sub-paths, so it must end with a path separator.

    Raises:
        ValueError: If ``mode`` is not one of the supported splits.
    """

    _MODES = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/interpep_data/"):
        # Fail early: an unknown mode used to leave ``num_data`` undefined and
        # only surfaced later as an AttributeError / UnboundLocalError.
        if mode not in self._MODES:
            raise ValueError(
                f"mode must be one of {self._MODES}, got {mode!r}")
        self.mode = mode
        self.encoded_data_directory = encoded_data_directory
        # NOTE: the example lists are read from fixed paths; they do not
        # follow ``encoded_data_directory``.
        self.train_dir = "../../datasets/interpep_data/train_examples.npy"
        self.test_dir = "../../datasets/interpep_data/test_examples.npy"
        self.val_dir = "../../datasets/interpep_data/val_examples.npy"
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)
        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example-name array that belongs to ``self.mode``."""
        if self.mode == "train":
            return self.train_list
        if self.mode == "val":
            return self.val_list
        if self.mode == "test":
            return self.test_list
        raise ValueError(
            f"mode must be one of {self._MODES}, got {self.mode!r}")

    def __getitem__(self, index):
        """Load and decode the example at ``index`` of the active split."""
        item = self._examples()[index]
        file_dir = self.encoded_data_directory

        with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated fields
        # of the example name.
        name_parts = item.split("_")
        receptor_name = name_parts[0] + "_" + name_parts[1]
        with np.load(file_dir + "receptor_data/" + receptor_name + ".npz") as data:
            temp_nodes = data["nodes"]

        # Per-node binary label vector.
        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)

        # Decode the receptor sequence from the first 20 node features
        # (assumed to be a one-hot residue encoding -- TODO confirm).
        residue_indices = np.argmax(temp_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the active split."""
        return self.num_data
class PPI(Dataset):
    """Protein-protein interaction pairs read from per-split CSV files.

    Each CSV row must provide the columns ``Chain_1_motifs``,
    ``Chain_2_motifs``, ``Sequence1`` and ``Sequence2``. The motif columns
    hold Python-literal lists of strings whose second ``"_"``-separated field
    is an integer position (e.g. ``"['A_3', 'A_7']"``).

    Every item is a ``(pep_sequence, prot_sequence, target)`` triple where
    ``target`` is a float tensor with one entry per character of
    ``prot_sequence``, set to 1 at the motif positions and 0 elsewhere.

    Args:
        mode: Split to serve: ``"train"``, ``"val"`` or ``"test"``.
        csv_dir_path: Directory containing ``train.csv`` and ``val.csv``
            (and ``test.csv`` when ``mode == "test"``).

    Raises:
        ValueError: If ``mode`` is not one of the supported splits.
    """

    _MODES = ("train", "val", "test")

    def __init__(self, mode, csv_dir_path="/home/u21307130002/PepNN/pepnn/datasets/ppi/"):
        if mode not in self._MODES:
            raise ValueError(
                f"mode must be one of {self._MODES}, got {mode!r}")
        self.mode = mode
        self.train_data = pd.read_csv(os.path.join(csv_dir_path, 'train.csv'))
        self.val_data = pd.read_csv(os.path.join(csv_dir_path, 'val.csv'))
        # test.csv is only read on demand so that train/val usage keeps
        # working when no test split has been exported.
        if mode == "test":
            self.test_data = pd.read_csv(os.path.join(csv_dir_path, 'test.csv'))
        # Previously only the "train" split set ``num_data``, so ``len()``
        # raised AttributeError for every other mode.
        self.num_data = len(self._split_frame())

    def _split_frame(self):
        """Return the DataFrame that belongs to ``self.mode``."""
        if self.mode not in self._MODES:
            raise ValueError(
                f"mode must be one of {self._MODES}, got {self.mode!r}")
        return getattr(self, self.mode + "_data")

    def __len__(self):
        """Return the number of rows in the active split."""
        return self.num_data

    def __getitem__(self, index):
        """Build the ``(pep_sequence, prot_sequence, target)`` triple for a row.

        Raises:
            IndexError: If a motif position lies beyond the end of the
                sequence chosen as ``prot_sequence``.
        """
        if torch.is_tensor(index):
            index = index.tolist()
        item = self._split_frame().iloc[index]

        motif1 = ast.literal_eval(item['Chain_1_motifs'])
        motif2 = ast.literal_eval(item['Chain_2_motifs'])

        # NOTE(review): this compares the string length of the FIRST motif of
        # each chain, not the number of motifs or the sequence lengths --
        # confirm this is the intended rule for choosing the protein side.
        if len(motif1[0]) > len(motif2[0]):
            motifs = motif1
            prot_sequence = item['Sequence1']
            pep_sequence = item['Sequence2']
        else:
            motifs = motif2
            pep_sequence = item['Sequence1']
            prot_sequence = item['Sequence2']

        positions = [int(motif.split('_')[1]) for motif in motifs]

        # Report bad rows with an exception instead of dropping into the
        # debugger (which blocks DataLoader workers), and check every
        # position rather than only the last one.
        prot_length = len(prot_sequence)
        out_of_range = [pos for pos in positions if pos >= prot_length]
        if out_of_range:
            raise IndexError(
                f"motif positions {out_of_range} at index {index} exceed the "
                f"protein sequence length {prot_length}")

        binding = np.zeros(prot_length)
        if positions:
            binding[positions] = 1
        target = torch.from_numpy(binding).float()

        return pep_sequence, prot_sequence, target
class PepBindComplexes(Dataset):
    """Peptide/receptor examples read from the ``pepbind_data`` directory.

    Every item is a ``(pep_sequence, prot_sequence, target)`` triple:

    * ``pep_sequence`` -- ``torch.argmax`` over the last axis of the stored
      ``target_sequence`` array (presumably a one-hot peptide encoding --
      TODO confirm against the preprocessing script).
    * ``prot_sequence`` -- space-separated one-letter residue codes decoded
      from the first 20 columns of the receptor ``nodes`` array.
    * ``target`` -- ``LongTensor`` with one entry per receptor node, set to 1
      at the stored ``binding_sites`` indices and 0 elsewhere.

    Args:
        mode: Split to serve: ``"train"``, ``"val"`` or ``"test"``.
        encoded_data_directory: Directory holding the ``fragment_data/`` and
            ``receptor_data/`` ``.npz`` files. It is concatenated directly
            with those sub-paths, so it must end with a path separator.

    Raises:
        ValueError: If ``mode`` is not one of the supported splits.
    """

    _MODES = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepbind_data/"):
        # Reject unknown modes here instead of failing later with an
        # AttributeError (``num_data``) or UnboundLocalError (``item``).
        if mode not in self._MODES:
            raise ValueError(
                f"mode must be one of {self._MODES}, got {mode!r}")
        self.mode = mode
        self.encoded_data_directory = encoded_data_directory
        # NOTE: the example lists are read from fixed paths; they do not
        # follow ``encoded_data_directory``.
        self.train_dir = "../../datasets/pepbind_data/train_examples.npy"
        self.test_dir = "../../datasets/pepbind_data/test_examples.npy"
        self.val_dir = "../../datasets/pepbind_data/val_examples.npy"
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)
        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example-name array that belongs to ``self.mode``."""
        if self.mode == "train":
            return self.train_list
        if self.mode == "val":
            return self.val_list
        if self.mode == "test":
            return self.test_list
        raise ValueError(
            f"mode must be one of {self._MODES}, got {self.mode!r}")

    def __getitem__(self, index):
        """Load and decode the example at ``index`` of the active split."""
        item = self._examples()[index]
        file_dir = self.encoded_data_directory

        with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated fields
        # of the example name.
        name_parts = item.split("_")
        receptor_name = name_parts[0] + "_" + name_parts[1]
        with np.load(file_dir + "receptor_data/" + receptor_name + ".npz") as data:
            temp_nodes = data["nodes"]

        # Per-node binary label vector.
        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)

        # Decode the receptor sequence from the first 20 node features
        # (assumed to be a one-hot residue encoding -- TODO confirm).
        residue_indices = np.argmax(temp_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the active split."""
        return self.num_data
class PeptideComplexes(Dataset):
    """Weighted peptide/receptor examples read from the ``pepnn_data`` tree.

    Every item is a ``(pep_sequence, prot_sequence, target, weight)`` tuple:

    * ``pep_sequence`` -- ``torch.argmax`` over the last axis of the stored
      ``target_sequence`` array (presumably a one-hot peptide encoding --
      TODO confirm against the preprocessing script).
    * ``prot_sequence`` -- space-separated one-letter residue codes decoded
      from the first 20 columns of the receptor ``nodes`` array.
    * ``target`` -- ``LongTensor`` with one entry per receptor node, set to 1
      at the stored ``binding_sites`` indices and 0 elsewhere.
    * ``weight`` -- ``example_weights[item]`` for the train/val splits and
      the constant ``1`` for the test split.

    Args:
        mode: Split to serve: ``"train"``, ``"val"`` or ``"test"``.
        encoded_data_directory: Directory holding the ``fragment_data/`` and
            ``receptor_data/`` ``.npz`` files for the train/val splits. It is
            concatenated directly with those sub-paths, so it must end with
            a path separator. The test split always reads from
            ``../../datasets/pepnn_test_data/all_data/``.

    Raises:
        ValueError: If ``mode`` is not one of the supported splits.
    """

    _MODES = ("train", "val", "test")

    def __init__(self, mode,
                 encoded_data_directory="../../datasets/pepnn_data/all_data/"):
        # Reject unknown modes here instead of failing later with an
        # AttributeError (``num_data``) or UnboundLocalError (``item``).
        if mode not in self._MODES:
            raise ValueError(
                f"mode must be one of {self._MODES}, got {mode!r}")
        self.mode = mode
        self.encoded_data_directory = encoded_data_directory
        # NOTE: the example lists and weights are read from fixed paths; the
        # test list lives in a separate ``pepnn_test_data`` tree.
        self.train_dir = "../../datasets/pepnn_data/train_examples.npy"
        self.test_dir = "../../datasets/pepnn_test_data/test_examples.npy"
        self.val_dir = "../../datasets/pepnn_data/val_examples.npy"
        self.example_weights = np.load("../../datasets/pepnn_data/example_weights.npy")
        self.test_list = np.load(self.test_dir)
        self.train_list = np.load(self.train_dir)
        self.val_list = np.load(self.val_dir)
        self.num_data = len(self._examples())

    def _examples(self):
        """Return the example array that belongs to ``self.mode``."""
        if self.mode == "train":
            return self.train_list
        if self.mode == "val":
            return self.val_list
        if self.mode == "test":
            return self.test_list
        raise ValueError(
            f"mode must be one of {self._MODES}, got {self.mode!r}")

    def __getitem__(self, index):
        """Load and decode the example at ``index`` of the active split."""
        item = self._examples()[index]

        if self.mode == "test":
            # Test examples are unweighted and stored in their own tree.
            weight = 1
            file_dir = "../../datasets/pepnn_test_data/all_data/"
        else:
            # NOTE(review): ``item`` is used both as an index into
            # ``example_weights`` and as a file-name component below --
            # confirm the stored types make both uses valid.
            weight = self.example_weights[item]
            file_dir = self.encoded_data_directory

        with np.load(file_dir + "fragment_data/" + item + ".npz") as data:
            temp_pep_sequence = data["target_sequence"]
            temp_binding_sites = data["binding_sites"]

        # The receptor file is named after the first two "_"-separated fields
        # of the example name.
        name_parts = item.split("_")
        receptor_name = name_parts[0] + "_" + name_parts[1]
        with np.load(file_dir + "receptor_data/" + receptor_name + ".npz") as data:
            temp_nodes = data["nodes"]

        # Per-node binary label vector.
        binding = np.zeros(len(temp_nodes))
        if len(temp_binding_sites) != 0:
            binding[temp_binding_sites] = 1
        target = torch.LongTensor(binding)

        # Decode the receptor sequence from the first 20 node features
        # (assumed to be a one-hot residue encoding -- TODO confirm).
        residue_indices = np.argmax(temp_nodes[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(i) for i in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(temp_pep_sequence), dim=-1)

        return pep_sequence, prot_sequence, target, weight

    def __len__(self):
        """Return the number of examples in the active split."""
        return self.num_data
class BitenetComplexes(Dataset):
    """Peptide/receptor examples of the Bitenet set (single, unsplit list).

    Every item is a ``(pep_sequence, prot_sequence, target)`` triple:

    * ``pep_sequence`` -- ``torch.argmax`` over the last axis of the stored
      ``target_sequence`` array.
    * ``prot_sequence`` -- space-separated one-letter residue codes decoded
      from the first 20 columns of the receptor ``nodes`` array.
    * ``target`` -- ``LongTensor`` that is 1 wherever the ``binding_matrix``
      summed over its first axis is at least 1.

    Args:
        encoded_data_directory: Directory holding the ``fragment_data/`` and
            ``receptor_data/`` ``.npz`` files; concatenated directly with
            those sub-paths, so it must end with a path separator.
    """

    # NOTE(review): the default data directory ("../bitenet_data/...") and the
    # examples list ("../../datasets/bitenet_data/...") use different roots --
    # confirm this is intentional.
    def __init__(self, encoded_data_directory = "../bitenet_data/all_data/"):
        self.encoded_data_directory = encoded_data_directory
        self.train_dir = "../../datasets/bitenet_data/examples.npy"
        self.full_list = np.load(self.train_dir)
        self.num_data = len(self.full_list)

    def __getitem__(self, index):
        """Load and decode the example stored at position ``index``."""
        example = self.full_list[index]
        root = self.encoded_data_directory

        # Fragment files insert "_" before the last character of the name.
        fragment_path = (root + "fragment_data/" + example[:-1] + "_"
                         + example[-1] + ".npz")
        with np.load(fragment_path) as fragment:
            peptide_encoding = fragment["target_sequence"]
            binding_matrix = fragment["binding_matrix"]

        # Receptor files use the first field plus the first character of the
        # second "_"-separated field.
        receptor_path = (root + "receptor_data/" + example.split("_")[0] + "_"
                         + example.split("_")[1][0] + ".npz")
        with np.load(receptor_path) as receptor:
            node_features = receptor["nodes"]

        contacts = np.sum(binding_matrix, axis=0).T
        target = torch.LongTensor(contacts >= 1)

        residue_indices = np.argmax(node_features[:, 0:20], axis=-1)
        prot_sequence = " ".join(
            [Polypeptide.index_to_one(idx) for idx in residue_indices])

        pep_sequence = torch.argmax(torch.FloatTensor(peptide_encoding), dim=-1)

        return pep_sequence, prot_sequence, target

    def __len__(self):
        """Return the number of examples in the list."""
        return self.num_data
Xet Storage Details
- Size:
- 11.4 kB
- Xet hash:
- e7967fdb8ca3ec5ac9d4fe5706b6c84db12dc9376614948c8db202afa86d4d16
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.