# Prepared for deploy (commit 13188b8)
import numpy as np
import torch
import pandas as pd
from rdkit import Chem, rdBase
from torch_geometric.data import Data
from torch.utils.data import Dataset
rdBase.DisableLog("rdApp.*")
def one_of_k_encoding(x, allowable_set):
    """One-hot encode *x* against *allowable_set* as a list of booleans.

    Values not in the set are mapped onto the LAST entry, which by
    convention is the "unknown" bucket.
    """
    if x not in allowable_set:
        # Fall back to the final slot for out-of-vocabulary values.
        x = allowable_set[-1]
    return [x == candidate for candidate in allowable_set]
def get_atom_features(atom):
    """Build a fixed-length boolean feature vector for one RDKit atom.

    Concatenates one-hot encodings of: element symbol, degree (neighbour
    count), total H count, implicit valence, hybridization, plus a single
    aromaticity flag.  Each one-hot block reserves its last slot for
    "unknown"/"other" values.
    """
    # Element vocabulary; "Unknown" catches anything not listed.
    symbols = [
        "C", "N", "O", "S", "F", "Si", "P", "Cl", "Br", "Mg", "Na",
        "Ca", "Fe", "As", "Al", "I", "B", "V", "K", "Tl", "Yb", "Sb",
        "Sn", "Ag", "Pd", "Co", "Se", "Ti", "Zn", "H", "Li", "Ge",
        "Cu", "Au", "Ni", "Cd", "In", "Mn", "Zr", "Cr", "Pt", "Hg",
        "Pb", "Unknown",
    ]
    # Degree, H count and implicit valence all share the same 0..10 range.
    counts = list(range(11))
    # Hybridization drives 3D geometry (sp2 trigonal planar, sp3 tetrahedral).
    hybridizations = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
        Chem.rdchem.HybridizationType.SP3D,
        Chem.rdchem.HybridizationType.SP3D2,
        "other",
    ]
    features = (
        one_of_k_encoding(atom.GetSymbol(), symbols)
        + one_of_k_encoding(atom.GetDegree(), counts)
        # Hydrogen count hints at hydrogen-bond donor capacity.
        + one_of_k_encoding(atom.GetTotalNumHs(), counts)
        + one_of_k_encoding(atom.GetImplicitValence(), counts)
        + one_of_k_encoding(atom.GetHybridization(), hybridizations)
        # Aromaticity as a single boolean flag.
        + [atom.GetIsAromatic()]
    )
    return np.array(features)
class SmilesDataset(Dataset):
    """Map-style dataset turning (smiles, label) rows into PyG ``Data`` graphs.

    ``dataframe`` must expose "smiles" and "label" columns.  Rows whose
    SMILES fail to parse yield ``None``; the consumer (e.g. a collate_fn)
    is expected to filter those out.
    """

    def __init__(self, dataframe):
        # Kept as a DataFrame; rows are materialized lazily in __getitem__.
        self.data = dataframe

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        smiles = row["smiles"]
        label = row["label"]
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            # Unparseable SMILES — signal with None rather than raising.
            return None
        # Nodes: one feature vector per atom.
        atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
        x = torch.tensor(np.array(atom_features), dtype=torch.float)
        # Edges: each undirected bond becomes two directed edges.
        edge_indexes = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_indexes.append((i, j))
            edge_indexes.append((j, i))
        if edge_indexes:
            # t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
            # contiguous - take the virtually transposed tensor and make its
            # physical copy and lay bytes sequentially
            edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
        else:
            # Bug fix: bond-free molecules (e.g. "C", "[Na+]") used to produce
            # torch.tensor([]) of shape [0]; PyG requires edge_index of
            # shape [2, 0] even when the graph has no edges.
            edge_index = torch.empty((2, 0), dtype=torch.long)
        # Label as a 1-element long tensor (classification target).
        y = torch.tensor([label], dtype=torch.long)
        return Data(x=x, edge_index=edge_index, y=y)
if __name__ == "__main__":
    # One-off conversion: space-separated .txt splits -> .csv, then wrap
    # each split in a SmilesDataset and report sizes.
    column_names = ["smiles", "label"]

    def _load_split(split):
        # Raw files are space-separated and carry no header row.
        return pd.read_csv(
            f"dataset/classification/data_{split}.txt",
            sep=" ",
            header=None,
            names=column_names,
        )

    train_frame = _load_split("train")
    test_frame = _load_split("test")
    train_frame.to_csv("dataset/classification/data_train.csv", index=False)
    test_frame.to_csv("dataset/classification/data_test.csv", index=False)
    train_dataset = SmilesDataset(train_frame)
    test_dataset = SmilesDataset(test_frame)
    print(len(train_dataset))
    print(len(test_dataset))