Spaces:

AlexSychovUN
/

BindingAffinityPrediction

Sleeping

App Files Files Community

AlexSychovUN committed on Dec 4, 2025

Commit

da5d904

1 Parent(s): da7c0f0

Added files

Browse files

Files changed (4) hide show

.gitignore +1 -2
GNN_classification/Dataset_Preparation.py +49 -8
GNN_classification/dataset/classification/EDA.ipynb +178 -0
GNN_classification/training.py +13 -6

.gitignore CHANGED Viewed

	@@ -1,2 +1 @@
1	- .idea
2	- .ipynb_checkpoints


1	+ .idea

GNN_classification/Dataset_Preparation.py CHANGED Viewed

@@ -1,10 +1,49 @@
 import torch
 import pandas as pd
-from rdkit import Chem
 from torch_geometric.data import Data
 from torch.utils.data import Dataset
 class SmilesDataset(Dataset):
     def __init__(self, dataframe):
@@ -22,8 +61,9 @@ class SmilesDataset(Dataset):
         if mol is None: return None
         # Nodes
-        atom_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
-        x = torch.tensor(atom_features, dtype=torch.float)
         # Edges
         edge_indexes = []
@@ -35,10 +75,8 @@ class SmilesDataset(Dataset):
         # t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
         # contiguous - the transpose is only a view; copy it into a new tensor whose elements are laid out sequentially in memory
-        if not edge_indexes:
-            edge_index = torch.empty((2, 0), dtype=torch.long)
-        else:
-            edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
         # Label
@@ -54,10 +92,13 @@ if __name__ == "__main__":
     test_dataset = pd.read_csv(
         "dataset/classification/data_test.txt", sep=" ", header=None, names=columns
     )
     train_dataset = SmilesDataset(train_dataset)
     test_dataset = SmilesDataset(test_dataset)
     print(len(train_dataset))
     print(len(test_dataset))

+import numpy as np
 import torch
 import pandas as pd
+from rdkit import Chem, rdBase
 from torch_geometric.data import Data
 from torch.utils.data import Dataset
+rdBase.DisableLog('rdApp.*')
def one_of_k_encoding(x, allowable_set):
    """One-hot encode ``x`` against ``allowable_set``.

    Args:
        x: Value to encode.
        allowable_set: Ordered sequence of allowed values. Its last element
            doubles as the "unknown" bucket: any ``x`` that is not in the
            sequence is encoded as if it were that last element.

    Returns:
        A list with one entry per element of ``allowable_set``; the entry is
        True where the element equals ``x`` (after the unknown fallback) and
        False elsewhere. An empty ``allowable_set`` yields an empty list.
    """
    # An empty vocabulary has no "unknown" slot to fall back to, so the
    # encoding is simply empty (one entry per vocabulary element).
    if len(allowable_set) == 0:
        return []
    # last position - unknown
    if x not in allowable_set:
        x = allowable_set[-1]
    return [x == s for s in allowable_set]
# Element vocabulary for the atom-type one-hot. The trailing 'Unknown' entry
# is the bucket one_of_k_encoding uses for every symbol that is not listed.
_ATOM_SYMBOLS = (
    'C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
    'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co',
    'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn',
    'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown',
)
# Shared 0..10 buckets for degree, hydrogen count and implicit valence.
# Values outside this range land in the last bucket (10), because
# one_of_k_encoding maps anything unseen onto the final entry.
_COUNT_BUCKETS = tuple(range(11))


def get_atom_features(atom):
    """Build the one-hot feature vector describing a single atom.

    The vector concatenates, in this order: element symbol (44 slots),
    degree (11), total hydrogen count (11), implicit valence (11),
    hybridization (6) and a single aromaticity flag, 84 entries in total.

    Args:
        atom: An RDKit atom; it must provide GetSymbol, GetDegree,
            GetTotalNumHs, GetImplicitValence, GetHybridization and
            GetIsAromatic.

    Returns:
        A 1-D NumPy array holding the concatenated flags.
    """
    # Built here rather than at module level so that `Chem` is only touched
    # when the function is actually called.
    hybridizations = (
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
        Chem.rdchem.HybridizationType.SP3D,
        Chem.rdchem.HybridizationType.SP3D2,
        'other',  # catch-all bucket for any other hybridization state
    )
    return np.array(
        # Type of atom (Symbol)
        one_of_k_encoding(atom.GetSymbol(), _ATOM_SYMBOLS) +
        # Number of neighbours (Degree)
        one_of_k_encoding(atom.GetDegree(), _COUNT_BUCKETS) +
        # Total number of attached hydrogens (explicit + implicit) - bond donors
        one_of_k_encoding(atom.GetTotalNumHs(), _COUNT_BUCKETS) +
        # Implicit valence
        # NOTE(review): newer RDKit releases may deprecate GetImplicitValence()
        # - confirm against the installed version.
        one_of_k_encoding(atom.GetImplicitValence(), _COUNT_BUCKETS) +
        # Hybridization - important for 3D structure:
        # sp2 - trigonal planar, sp3 - tetrahedral
        one_of_k_encoding(atom.GetHybridization(), hybridizations) +
        # Aromaticity (Boolean)
        [atom.GetIsAromatic()]
    )
 class SmilesDataset(Dataset):
     def __init__(self, dataframe):
         if mol is None: return None
         # Nodes
+        atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
+        x = torch.tensor(np.array(atom_features), dtype=torch.float)
         # Edges
         edge_indexes = []
         # t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
         # contiguous - the transpose is only a view; copy it into a new tensor whose elements are laid out sequentially in memory
+        edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
         # Label
     test_dataset = pd.read_csv(
         "dataset/classification/data_test.txt", sep=" ", header=None, names=columns
     )
+    train_dataset.to_csv("dataset/classification/data_train.csv", index=False)
+    test_dataset.to_csv("dataset/classification/data_test.csv", index=False)
     train_dataset = SmilesDataset(train_dataset)
     test_dataset = SmilesDataset(test_dataset)
     print(len(train_dataset))
     print(len(test_dataset))

GNN_classification/dataset/classification/EDA.ipynb ADDED Viewed

	@@ -0,0 +1,178 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2025-12-04T17:09:59.971023Z",
+     "start_time": "2025-12-04T17:09:59.487573Z"
+    }
+   },
+   "source": "import pandas as pd",
+   "outputs": [],
+   "execution_count": 1
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-04T17:10:17.852208Z",
+     "start_time": "2025-12-04T17:10:17.820499Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "train_dataset = pd.read_csv(\"data_train.csv\")",
+   "id": "5602ccb4aefc74b1",
+   "outputs": [],
+   "execution_count": 2
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-04T17:10:28.992611Z",
+     "start_time": "2025-12-04T17:10:28.969929Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "train_dataset.describe()",
+   "id": "426ec6e722b80a8a",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "              label\n",
+       "count  37720.000000\n",
+       "mean       0.035260\n",
+       "std        0.184438\n",
+       "min        0.000000\n",
+       "25%        0.000000\n",
+       "50%        0.000000\n",
+       "75%        0.000000\n",
+       "max        1.000000"
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>37720.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>0.035260</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>0.184438</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>0.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>1.000000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-04T17:11:30.023817Z",
+     "start_time": "2025-12-04T17:11:30.012904Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "train_dataset['label'].value_counts()",
+   "id": "355c3ed8e5f76bbf",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "label\n",
+       "0    36390\n",
+       "1     1330\n",
+       "Name: count, dtype: int64"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 4
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "",
+   "id": "a88bb26653a0eb02"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

GNN_classification/training.py CHANGED Viewed

@@ -1,8 +1,7 @@
 import torch
-import torch.nn as nn
-import torch.nn.functional as F
 import pandas as pd
-from rdkit import Chem
 from torch_geometric.loader import DataLoader
@@ -61,6 +60,9 @@ if __name__ == "__main__":
     train_dataset = SmilesDataset(train_dataset)
     test_dataset = SmilesDataset(test_dataset)
     num_node_features = train_dataset[0].x.shape[1]
     num_classes = 2
@@ -71,7 +73,9 @@ if __name__ == "__main__":
     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
-    model = GNNClassifier(input_dim=1, output_dim=2, hidden_channels=16).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
     criterion = torch.nn.CrossEntropyLoss()
@@ -80,7 +84,10 @@ if __name__ == "__main__":
     print("Start Training")
     for epoch in range(1, EPOCHS + 1):
-        train_loss = train_epoch(model, train_loader, optimizer, criterion)
-        train_acc = evaluate(model, train_loader)
         print(f"Epoch: {epoch}, Loss: {train_loss}, Train Accuracy: {train_acc}")

 import torch
 import pandas as pd
 from torch_geometric.loader import DataLoader
     train_dataset = SmilesDataset(train_dataset)
     test_dataset = SmilesDataset(test_dataset)
+    train_dataset = [data for data in train_dataset if data is not None]
+    test_dataset = [data for data in test_dataset if data is not None]
     num_node_features = train_dataset[0].x.shape[1]
     num_classes = 2
     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
+    model = GNNClassifier(input_dim=num_node_features, output_dim=num_classes, hidden_channels=16).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
     criterion = torch.nn.CrossEntropyLoss()
     print("Start Training")
     for epoch in range(1, EPOCHS + 1):
+        train_loss = train(model, train_loader, optimizer, criterion)
+        train_acc = test(model, train_loader)
         print(f"Epoch: {epoch}, Loss: {train_loss}, Train Accuracy: {train_acc}")
+    test_acc = test(model, test_loader)
+    print(f"Test Accuracy: {test_acc}")