Spaces:

AlexSychovUN
/

BindingAffinityPrediction

Sleeping

App Files Files Community

AlexSychovUN committed on Dec 5, 2025

Commit

956b371

1 Parent(s): 71c437c

Added files

Browse files

Files changed (5) hide show

GNN_classification/Dataset_Preparation.py +69 -22
GNN_classification/model.py +5 -9
GNN_classification/training.py +5 -4
requirements.txt +5 -1
visualization.ipynb +155 -74

GNN_classification/Dataset_Preparation.py CHANGED Viewed

@@ -5,7 +5,7 @@ from rdkit import Chem, rdBase
 from torch_geometric.data import Data
 from torch.utils.data import Dataset
-rdBase.DisableLog('rdApp.*')
 def one_of_k_encoding(x, allowable_set):
@@ -16,35 +16,86 @@ def one_of_k_encoding(x, allowable_set):
 def get_atom_features(atom):
-    symbols_list = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
     degrees_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     numhs_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     implicit_valences_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     return np.array(
         # Type of atom (Symbol)
-        one_of_k_encoding(atom.GetSymbol(), symbols_list) +
         # Number of neighbours (Degree)
-        one_of_k_encoding(atom.GetDegree(), degrees_list) +
         # Number of attached hydrogen atoms (total: explicit + implicit) - bond donors
-        one_of_k_encoding(atom.GetTotalNumHs(), numhs_list) +
         # Valence - chemical potential
-        one_of_k_encoding(atom.GetImplicitValence(), implicit_valences_list) +
         # Hybridization - so important for 3d structure, sp2 - Trigonal planar, sp3 - Tetrahedral
-        one_of_k_encoding(atom.GetHybridization(), [
-            Chem.rdchem.HybridizationType.SP,
-            Chem.rdchem.HybridizationType.SP2,
-            Chem.rdchem.HybridizationType.SP3,
-            Chem.rdchem.HybridizationType.SP3D,
-            Chem.rdchem.HybridizationType.SP3D2,
-            'other']) +
         # Aromaticity (Boolean)
         [atom.GetIsAromatic()]
     )
 class SmilesDataset(Dataset):
     def __init__(self, dataframe):
         self.data = dataframe
@@ -58,13 +109,13 @@ class SmilesDataset(Dataset):
         label = row["label"]
         mol = Chem.MolFromSmiles(smiles)
-        if mol is None: return None
         # Nodes
         atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
         x = torch.tensor(np.array(atom_features), dtype=torch.float)
         # Edges
         edge_indexes = []
         for bond in mol.GetBonds():
@@ -78,7 +129,6 @@ class SmilesDataset(Dataset):
         edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
         # Label
         y = torch.tensor([label], dtype=torch.long)
         return Data(x=x, edge_index=edge_index, y=y)
@@ -98,8 +148,5 @@ if __name__ == "__main__":
     train_dataset = SmilesDataset(train_dataset)
     test_dataset = SmilesDataset(test_dataset)
     print(len(train_dataset))
     print(len(test_dataset))

 from torch_geometric.data import Data
 from torch.utils.data import Dataset
+rdBase.DisableLog("rdApp.*")
 def one_of_k_encoding(x, allowable_set):
 def get_atom_features(atom):
+    symbols_list = [
+        "C",
+        "N",
+        "O",
+        "S",
+        "F",
+        "Si",
+        "P",
+        "Cl",
+        "Br",
+        "Mg",
+        "Na",
+        "Ca",
+        "Fe",
+        "As",
+        "Al",
+        "I",
+        "B",
+        "V",
+        "K",
+        "Tl",
+        "Yb",
+        "Sb",
+        "Sn",
+        "Ag",
+        "Pd",
+        "Co",
+        "Se",
+        "Ti",
+        "Zn",
+        "H",
+        "Li",
+        "Ge",
+        "Cu",
+        "Au",
+        "Ni",
+        "Cd",
+        "In",
+        "Mn",
+        "Zr",
+        "Cr",
+        "Pt",
+        "Hg",
+        "Pb",
+        "Unknown",
+    ]
     degrees_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     numhs_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     implicit_valences_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     return np.array(
         # Type of atom (Symbol)
+        one_of_k_encoding(atom.GetSymbol(), symbols_list)
+        +
         # Number of neighbours (Degree)
+        one_of_k_encoding(atom.GetDegree(), degrees_list)
+        +
         # Number of attached hydrogen atoms (total: explicit + implicit) - bond donors
+        one_of_k_encoding(atom.GetTotalNumHs(), numhs_list)
+        +
         # Valence - chemical potential
+        one_of_k_encoding(atom.GetImplicitValence(), implicit_valences_list)
+        +
         # Hybridization - so important for 3d structure, sp2 - Trigonal planar, sp3 - Tetrahedral
+        one_of_k_encoding(
+            atom.GetHybridization(),
+            [
+                Chem.rdchem.HybridizationType.SP,
+                Chem.rdchem.HybridizationType.SP2,
+                Chem.rdchem.HybridizationType.SP3,
+                Chem.rdchem.HybridizationType.SP3D,
+                Chem.rdchem.HybridizationType.SP3D2,
+                "other",
+            ],
+        )
+        +
         # Aromaticity (Boolean)
         [atom.GetIsAromatic()]
     )
 class SmilesDataset(Dataset):
     def __init__(self, dataframe):
         self.data = dataframe
         label = row["label"]
         mol = Chem.MolFromSmiles(smiles)
+        if mol is None:
+            return None
         # Nodes
         atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
         x = torch.tensor(np.array(atom_features), dtype=torch.float)
         # Edges
         edge_indexes = []
         for bond in mol.GetBonds():
         edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
         # Label
         y = torch.tensor([label], dtype=torch.long)
         return Data(x=x, edge_index=edge_index, y=y)
     train_dataset = SmilesDataset(train_dataset)
     test_dataset = SmilesDataset(test_dataset)
     print(len(train_dataset))
     print(len(test_dataset))

GNN_classification/model.py CHANGED Viewed

@@ -1,13 +1,9 @@
-import torch
 import torch.nn as nn
 import torch.nn.functional as F
-import pandas as pd
-from rdkit import Chem
 from torch_geometric.nn import GCNConv, global_mean_pool
-from torch_geometric.data import Data
-from torch_geometric.loader import DataLoader
-from torch.utils.data import Dataset
 class GNNClassifier(nn.Module):
     def __init__(self, input_dim, output_dim, hidden_channels):
@@ -18,7 +14,7 @@ class GNNClassifier(nn.Module):
         self.conv2 = GCNConv(hidden_channels, hidden_channels)
         self.conv3 = GCNConv(hidden_channels, hidden_channels)
-        self.lin = nn.Linear(hidden_channels, output_dim) # classification task 0 or 1
     def forward(self, x, edge_index, batch):
         x = self.conv1(x, edge_index)
@@ -28,8 +24,8 @@ class GNNClassifier(nn.Module):
         x = self.conv3(x, edge_index)
         # Average the node embeddings to get the molecule vector
-        x = global_mean_pool(x, batch) # [batch_size, hidden_channels]
         x = F.dropout(x, p=0.5, training=self.training)
         x = self.lin(x)
-        return x

 import torch.nn as nn
 import torch.nn.functional as F
 from torch_geometric.nn import GCNConv, global_mean_pool
 class GNNClassifier(nn.Module):
     def __init__(self, input_dim, output_dim, hidden_channels):
         self.conv2 = GCNConv(hidden_channels, hidden_channels)
         self.conv3 = GCNConv(hidden_channels, hidden_channels)
+        self.lin = nn.Linear(hidden_channels, output_dim)  # classification task 0 or 1
     def forward(self, x, edge_index, batch):
         x = self.conv1(x, edge_index)
         x = self.conv3(x, edge_index)
         # Average the node embeddings to get the molecule vector
+        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]
         x = F.dropout(x, p=0.5, training=self.training)
         x = self.lin(x)
+        return x

GNN_classification/training.py CHANGED Viewed

@@ -8,9 +8,10 @@ from torch_geometric.loader import DataLoader
 from Dataset_Preparation import SmilesDataset
 from model import GNNClassifier
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 print(DEVICE)
 def train(model, loader, optimizer, criterion):
     model.train()
     total_loss = 0
@@ -73,9 +74,9 @@ if __name__ == "__main__":
     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
-    model = GNNClassifier(input_dim=num_node_features, output_dim=num_classes, hidden_channels=16).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
     criterion = torch.nn.CrossEntropyLoss()

 from Dataset_Preparation import SmilesDataset
 from model import GNNClassifier
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 print(DEVICE)
 def train(model, loader, optimizer, criterion):
     model.train()
     total_loss = 0
     train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
+    model = GNNClassifier(
+        input_dim=num_node_features, output_dim=num_classes, hidden_channels=16
+    ).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
     criterion = torch.nn.CrossEntropyLoss()

requirements.txt CHANGED Viewed

@@ -1,4 +1,8 @@
 pandas
 rdkit
 biopython
-torch

+torch
+numpy
 pandas
 rdkit
 biopython

visualization.ipynb CHANGED Viewed

@@ -2,180 +2,261 @@
  "cells": [
   {
    "cell_type": "code",
-   "execution_count": 1,
    "id": "initial_id",
-   "metadata": {},
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
-       "model_id": "ccfa267dcd6945b6be10a9cbeffb4e5e",
        "version_major": 2,
-       "version_minor": 0
-      },
-      "text/plain": []
      },
      "metadata": {},
-     "output_type": "display_data"
     }
    ],
-   "source": [
-    "import nglview as nv\n",
-    "import os"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 2,
    "id": "d8d7978e-980a-400c-8c6a-5365990c8855",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "PDBBIND_PATH = \"refined-set\""
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 3,
    "id": "788a6b43-c515-45c7-bc52-341d446b1a65",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "EXAMPLE_PDB_ID = \"1a1e\""
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 4,
    "id": "e8f4bebc-845f-43e8-bc4d-ab7b649eb49c",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "pdb_dir = os.path.join(PDBBIND_PATH, EXAMPLE_PDB_ID)"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 5,
    "id": "24b5e435-4d8f-4505-b27c-dd6317376ed4",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "protein_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_protein.pdb\")"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 6,
    "id": "e7fc3539-00c0-48a2-b012-c80757fa12c4",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "ligand_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_ligand.sdf\")"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
    "id": "9a053b99-7c01-4881-b3f7-e9b39090af9d",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "view = nv.NGLWidget()"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
    "id": "df8c8e00-3ce6-41dd-b457-d9f50e318dad",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "protein_comp = view.add_component(protein_file)"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 9,
    "id": "c191fead-fef8-4077-b787-5bf9552307b1",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "protein_comp.clear_representations()"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 10,
    "id": "4559033a-aeda-4659-8d91-9002b5a6ecda",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "protein_comp.add_representation('cartoon', color='blue')"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 11,
    "id": "73ea1a50-8463-40b8-a942-0c92d3e97a97",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "ligand_comp = view.add_component(ligand_file)"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 12,
    "id": "16cdb710-1ed6-4b1d-9e6a-69b7ad61a600",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "ligand_comp.clear_representations()"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 13,
    "id": "2193c497-f33c-4de0-86a9-6e535002fcb7",
-   "metadata": {},
-   "outputs": [],
    "source": [
     "ligand_comp.add_representation('ball+stick', radius=0.3)"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": 14,
    "id": "b1cc7f44-a374-4400-b4ba-8f75101b21ce",
-   "metadata": {},
    "outputs": [
     {
      "data": {
-      "application/vnd.jupyter.widget-view+json": {
-       "model_id": "6037e0edee3247a49cd586e52e64a61b",
-       "version_major": 2,
-       "version_minor": 0
-      },
       "text/plain": [
        "NGLWidget()"
-      ]
      },
      "metadata": {},
-     "output_type": "display_data"
     }
    ],
-   "source": [
-    "view"
-   ]
   },
   {
    "cell_type": "code",
-   "execution_count": null,
    "id": "5655e465-bb44-4218-a5e3-db2c5e62cd9c",
-   "metadata": {},
    "outputs": [],
-   "source": []
   }
  ],
  "metadata": {

  "cells": [
   {
    "cell_type": "code",
    "id": "initial_id",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.479196Z",
+     "start_time": "2025-12-05T14:02:00.003864Z"
+    }
+   },
+   "source": [
+    "import nglview as nv\n",
+    "import os"
+   ],
    "outputs": [
     {
      "data": {
+      "text/plain": [],
       "application/vnd.jupyter.widget-view+json": {
        "version_major": 2,
+       "version_minor": 0,
+       "model_id": "3016118bc02a458cbcb4491a27089a6a"
+      }
      },
      "metadata": {},
+     "output_type": "display_data",
+     "jetTransient": {
+      "display_id": null
+     }
     }
    ],
+   "execution_count": 1
   },
   {
    "cell_type": "code",
    "id": "d8d7978e-980a-400c-8c6a-5365990c8855",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.497753Z",
+     "start_time": "2025-12-05T14:02:00.493751Z"
+    }
+   },
    "source": [
     "PDBBIND_PATH = \"refined-set\""
+   ],
+   "outputs": [],
+   "execution_count": 2
   },
   {
    "cell_type": "code",
    "id": "788a6b43-c515-45c7-bc52-341d446b1a65",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.510747Z",
+     "start_time": "2025-12-05T14:02:00.505672Z"
+    }
+   },
    "source": [
     "EXAMPLE_PDB_ID = \"1a1e\""
+   ],
+   "outputs": [],
+   "execution_count": 3
   },
   {
    "cell_type": "code",
    "id": "e8f4bebc-845f-43e8-bc4d-ab7b649eb49c",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.523669Z",
+     "start_time": "2025-12-05T14:02:00.518519Z"
+    }
+   },
    "source": [
     "pdb_dir = os.path.join(PDBBIND_PATH, EXAMPLE_PDB_ID)"
+   ],
+   "outputs": [],
+   "execution_count": 4
   },
   {
    "cell_type": "code",
    "id": "24b5e435-4d8f-4505-b27c-dd6317376ed4",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.570497Z",
+     "start_time": "2025-12-05T14:02:00.565454Z"
+    }
+   },
    "source": [
     "protein_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_protein.pdb\")"
+   ],
+   "outputs": [],
+   "execution_count": 5
   },
   {
    "cell_type": "code",
    "id": "e7fc3539-00c0-48a2-b012-c80757fa12c4",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.584673Z",
+     "start_time": "2025-12-05T14:02:00.578982Z"
+    }
+   },
    "source": [
     "ligand_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_ligand.sdf\")"
+   ],
+   "outputs": [],
+   "execution_count": 6
   },
   {
    "cell_type": "code",
    "id": "9a053b99-7c01-4881-b3f7-e9b39090af9d",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.649631Z",
+     "start_time": "2025-12-05T14:02:00.591897Z"
+    }
+   },
    "source": [
     "view = nv.NGLWidget()"
+   ],
+   "outputs": [],
+   "execution_count": 7
   },
   {
    "cell_type": "code",
    "id": "df8c8e00-3ce6-41dd-b457-d9f50e318dad",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.779528Z",
+     "start_time": "2025-12-05T14:02:00.657448Z"
+    }
+   },
    "source": [
     "protein_comp = view.add_component(protein_file)"
+   ],
+   "outputs": [],
+   "execution_count": 8
   },
   {
    "cell_type": "code",
    "id": "c191fead-fef8-4077-b787-5bf9552307b1",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.802894Z",
+     "start_time": "2025-12-05T14:02:00.795534Z"
+    }
+   },
    "source": [
     "protein_comp.clear_representations()"
+   ],
+   "outputs": [],
+   "execution_count": 9
   },
   {
    "cell_type": "code",
    "id": "4559033a-aeda-4659-8d91-9002b5a6ecda",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.824161Z",
+     "start_time": "2025-12-05T14:02:00.817622Z"
+    }
+   },
    "source": [
     "protein_comp.add_representation('cartoon', color='blue')"
+   ],
+   "outputs": [],
+   "execution_count": 10
   },
   {
    "cell_type": "code",
    "id": "73ea1a50-8463-40b8-a942-0c92d3e97a97",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.850013Z",
+     "start_time": "2025-12-05T14:02:00.840262Z"
+    }
+   },
    "source": [
     "ligand_comp = view.add_component(ligand_file)"
+   ],
+   "outputs": [],
+   "execution_count": 11
   },
   {
    "cell_type": "code",
    "id": "16cdb710-1ed6-4b1d-9e6a-69b7ad61a600",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.866184Z",
+     "start_time": "2025-12-05T14:02:00.859732Z"
+    }
+   },
    "source": [
     "ligand_comp.clear_representations()"
+   ],
+   "outputs": [],
+   "execution_count": 12
   },
   {
    "cell_type": "code",
    "id": "2193c497-f33c-4de0-86a9-6e535002fcb7",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.882846Z",
+     "start_time": "2025-12-05T14:02:00.876856Z"
+    }
+   },
    "source": [
     "ligand_comp.add_representation('ball+stick', radius=0.3)"
+   ],
+   "outputs": [],
+   "execution_count": 13
   },
   {
    "cell_type": "code",
    "id": "b1cc7f44-a374-4400-b4ba-8f75101b21ce",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.903573Z",
+     "start_time": "2025-12-05T14:02:00.897038Z"
+    }
+   },
+   "source": [
+    "view"
+   ],
    "outputs": [
     {
      "data": {
       "text/plain": [
        "NGLWidget()"
+      ],
+      "application/vnd.jupyter.widget-view+json": {
+       "version_major": 2,
+       "version_minor": 0,
+       "model_id": "028b8398377e4869a80fba4c3d5e5921"
+      }
      },
      "metadata": {},
+     "output_type": "display_data",
+     "jetTransient": {
+      "display_id": null
+     }
     }
    ],
+   "execution_count": 14
   },
   {
    "cell_type": "code",
    "id": "5655e465-bb44-4218-a5e3-db2c5e62cd9c",
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-05T14:02:00.915090Z",
+     "start_time": "2025-12-05T14:02:00.912563Z"
+    }
+   },
+   "source": [],
    "outputs": [],
+   "execution_count": null
   }
  ],
  "metadata": {