# Prepared for deploy (commit 13188b8)
import numpy as np
import torch
import pandas as pd
from rdkit import Chem, rdBase
from torch_geometric.data import Data
from torch.utils.data import Dataset
rdBase.DisableLog("rdApp.*")
def one_of_k_encoding(x, allowable_set):
    """One-hot encode *x* against *allowable_set* as a list of booleans.

    Values not in the set are mapped onto the LAST entry, which by
    convention is the "unknown" bucket.
    """
    if x not in allowable_set:
        # Fall back to the final slot for out-of-vocabulary values.
        x = allowable_set[-1]
    return [x == candidate for candidate in allowable_set]
def get_atom_features(atom):
    """Build a fixed-length boolean feature vector for one RDKit atom.

    Concatenates one-hot encodings of: element symbol, degree (neighbour
    count), total H count, implicit valence, hybridization, plus a single
    aromaticity flag.  Each one-hot block reserves its last slot for
    "unknown"/"other" values.
    """
    # Element vocabulary; "Unknown" catches anything not listed.
    symbols = [
        "C", "N", "O", "S", "F", "Si", "P", "Cl", "Br", "Mg", "Na",
        "Ca", "Fe", "As", "Al", "I", "B", "V", "K", "Tl", "Yb", "Sb",
        "Sn", "Ag", "Pd", "Co", "Se", "Ti", "Zn", "H", "Li", "Ge",
        "Cu", "Au", "Ni", "Cd", "In", "Mn", "Zr", "Cr", "Pt", "Hg",
        "Pb", "Unknown",
    ]
    # Degree, H count and implicit valence all share the same 0..10 range.
    counts = list(range(11))
    # Hybridization drives 3D geometry (sp2 trigonal planar, sp3 tetrahedral).
    hybridizations = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
        Chem.rdchem.HybridizationType.SP3D,
        Chem.rdchem.HybridizationType.SP3D2,
        "other",
    ]
    features = (
        one_of_k_encoding(atom.GetSymbol(), symbols)
        + one_of_k_encoding(atom.GetDegree(), counts)
        # Hydrogen count hints at hydrogen-bond donor capacity.
        + one_of_k_encoding(atom.GetTotalNumHs(), counts)
        + one_of_k_encoding(atom.GetImplicitValence(), counts)
        + one_of_k_encoding(atom.GetHybridization(), hybridizations)
        # Aromaticity as a single boolean flag.
        + [atom.GetIsAromatic()]
    )
    return np.array(features)
class SmilesDataset(Dataset):
    """Map-style dataset turning (smiles, label) rows into PyG ``Data`` graphs.

    ``dataframe`` must expose "smiles" and "label" columns.  Rows whose
    SMILES fail to parse yield ``None``; the consumer (e.g. a collate_fn)
    is expected to filter those out.
    """

    def __init__(self, dataframe):
        # Kept as a DataFrame; rows are materialized lazily in __getitem__.
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        smiles = row["smiles"]
        label = row["label"]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES — signal with None rather than raising.
            return None
        # Nodes: one feature vector per atom.
        atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
        x = torch.tensor(np.array(atom_features), dtype=torch.float)
        # Edges: each undirected bond becomes two directed edges.
        edge_indexes = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indexes.append((i, j))
            edge_indexes.append((j, i))
        if edge_indexes:
            # t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
            # contiguous - take the virtually transposed tensor and make its
            # physical copy and lay bytes sequentially
            edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
        else:
            # Bug fix: bond-free molecules (e.g. "C", "[Na+]") used to produce
            # torch.tensor([]) of shape [0]; PyG requires edge_index of
            # shape [2, 0] even when the graph has no edges.
            edge_index = torch.empty((2, 0), dtype=torch.long)
        # Label as a 1-element long tensor (classification target).
        y = torch.tensor([label], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)
if __name__ == "__main__":
    # One-off conversion: space-separated .txt splits -> .csv, then wrap
    # each split in a SmilesDataset and report sizes.
    column_names = ["smiles", "label"]

    def _load_split(split):
        # Raw files are space-separated and carry no header row.
        return pd.read_csv(
            f"dataset/classification/data_{split}.txt",
            sep=" ",
            header=None,
            names=column_names,
        )

    train_frame = _load_split("train")
    test_frame = _load_split("test")
    train_frame.to_csv("dataset/classification/data_train.csv", index=False)
    test_frame.to_csv("dataset/classification/data_test.csv", index=False)
    train_dataset = SmilesDataset(train_frame)
    test_dataset = SmilesDataset(test_frame)
    print(len(train_dataset))
    print(len(test_dataset))