AlexSychovUN committed on
Commit
da7c0f0
·
0 Parent(s):

Added files

Browse files
.gitignore ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ .idea
2
+ .ipynb_checkpoints
GNN_classification/Dataset_Preparation.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import pandas as pd
3
+
4
+ from rdkit import Chem
5
+ from torch_geometric.data import Data
6
+ from torch.utils.data import Dataset
7
+
8
+
9
+ class SmilesDataset(Dataset):
10
+ def __init__(self, dataframe):
11
+ self.data = dataframe
12
+
13
+ def __len__(self):
14
+ return len(self.data)
15
+
16
+ def __getitem__(self, idx):
17
+ row = self.data.iloc[idx]
18
+ smiles = row["smiles"]
19
+ label = row["label"]
20
+
21
+ mol = Chem.MolFromSmiles(smiles)
22
+ if mol is None: return None
23
+
24
+ # Nodes
25
+ atom_features = [[atom.GetAtomicNum()] for atom in mol.GetAtoms()]
26
+ x = torch.tensor(atom_features, dtype=torch.float)
27
+
28
+ # Edges
29
+ edge_indexes = []
30
+ for bond in mol.GetBonds():
31
+ i = bond.GetBeginAtomIdx()
32
+ j = bond.GetEndAtomIdx()
33
+ edge_indexes.append((i, j))
34
+ edge_indexes.append((j, i))
35
+
36
+ # t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
37
+ # contiguous - take the virtually transposed tensor and make its physical copy and lay bytes sequentially
38
+ if not edge_indexes:
39
+ edge_index = torch.empty((2, 0), dtype=torch.long)
40
+ else:
41
+ edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
42
+
43
+
44
+ # Label
45
+ y = torch.tensor([label], dtype=torch.long)
46
+ return Data(x=x, edge_index=edge_index, y=y)
47
+
48
+
49
+ if __name__ == "__main__":
50
+ columns = ["smiles", "label"]
51
+ train_dataset = pd.read_csv(
52
+ "dataset/classification/data_train.txt", sep=" ", header=None, names=columns
53
+ )
54
+ test_dataset = pd.read_csv(
55
+ "dataset/classification/data_test.txt", sep=" ", header=None, names=columns
56
+ )
57
+
58
+ train_dataset = SmilesDataset(train_dataset)
59
+ test_dataset = SmilesDataset(test_dataset)
60
+
61
+ print(len(train_dataset))
62
+ print(len(test_dataset))
63
+
64
+
GNN_classification/dataset/classification/data_test.txt ADDED
The diff for this file is too large to render. See raw diff
 
GNN_classification/dataset/classification/data_train.txt ADDED
The diff for this file is too large to render. See raw diff
 
GNN_classification/model.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import pandas as pd
5
+ from rdkit import Chem
6
+
7
+ from torch_geometric.nn import GCNConv, global_mean_pool
8
+ from torch_geometric.data import Data
9
+ from torch_geometric.loader import DataLoader
10
+ from torch.utils.data import Dataset
11
+
12
+ class GNNClassifier(nn.Module):
13
+ def __init__(self, input_dim, output_dim, hidden_channels):
14
+ super().__init__()
15
+ self.hidden_channels = hidden_channels
16
+
17
+ self.conv1 = GCNConv(input_dim, hidden_channels)
18
+ self.conv2 = GCNConv(hidden_channels, hidden_channels)
19
+ self.conv3 = GCNConv(hidden_channels, hidden_channels)
20
+
21
+ self.lin = nn.Linear(hidden_channels, output_dim) # classification task 0 or 1
22
+
23
+ def forward(self, x, edge_index, batch):
24
+ x = self.conv1(x, edge_index)
25
+ x = x.relu()
26
+ x = self.conv2(x, edge_index)
27
+ x = x.relu()
28
+ x = self.conv3(x, edge_index)
29
+
30
+ # Averaging nodes and got the molecula vector
31
+ x = global_mean_pool(x, batch) # [batch_size, hidden_channels]
32
+
33
+ x = F.dropout(x, p=0.5, training=self.training)
34
+ x = self.lin(x)
35
+ return x
GNN_classification/training.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import torch.nn.functional as F
4
+ import pandas as pd
5
+ from rdkit import Chem
6
+
7
+ from torch_geometric.loader import DataLoader
8
+
9
+ from Dataset_Preparation import SmilesDataset
10
+ from model import GNNClassifier
11
+
12
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
13
+ print(DEVICE)
14
+
15
+ def train(model, loader, optimizer, criterion):
16
+ model.train()
17
+ total_loss = 0
18
+
19
+ for batch in loader:
20
+ batch = batch.to(DEVICE)
21
+
22
+ optimizer.zero_grad()
23
+
24
+ out = model(batch.x, batch.edge_index, batch.batch)
25
+
26
+ loss = criterion(out, batch.y)
27
+ loss.backward()
28
+ optimizer.step()
29
+
30
+ total_loss += loss.item()
31
+
32
+ return total_loss / len(loader)
33
+
34
+
35
+ def test(model, loader):
36
+ model.eval()
37
+ correct = 0
38
+
39
+ with torch.no_grad():
40
+ for batch in loader:
41
+ batch = batch.to(DEVICE)
42
+ out = model(batch.x, batch.edge_index, batch.batch)
43
+
44
+ pred = out.argmax(dim=1)
45
+
46
+ correct += (pred == batch.y).sum().item()
47
+
48
+ acc = correct / len(loader.dataset)
49
+ return acc
50
+
51
+
52
+ if __name__ == "__main__":
53
+ columns = ["smiles", "label"]
54
+ train_dataset = pd.read_csv(
55
+ "dataset/classification/data_train.txt", sep=" ", header=None, names=columns
56
+ )
57
+ test_dataset = pd.read_csv(
58
+ "dataset/classification/data_test.txt", sep=" ", header=None, names=columns
59
+ )
60
+
61
+ train_dataset = SmilesDataset(train_dataset)
62
+ test_dataset = SmilesDataset(test_dataset)
63
+
64
+ num_node_features = train_dataset[0].x.shape[1]
65
+ num_classes = 2
66
+
67
+ print(f"Train samples: {len(train_dataset)}")
68
+ print(f"Test samples: {len(test_dataset)}")
69
+ print(f"Node features: {num_node_features}")
70
+
71
+ train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
72
+ test_loader = DataLoader(test_dataset, batch_size=32, shuffle=True)
73
+
74
+ model = GNNClassifier(input_dim=1, output_dim=2, hidden_channels=16).to(DEVICE)
75
+
76
+ optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
77
+ criterion = torch.nn.CrossEntropyLoss()
78
+
79
+ EPOCHS = 20
80
+ print("Start Training")
81
+
82
+ for epoch in range(1, EPOCHS + 1):
83
+ train_loss = train_epoch(model, train_loader, optimizer, criterion)
84
+
85
+ train_acc = evaluate(model, train_loader)
86
+ print(f"Epoch: {epoch}, Loss: {train_loss}, Train Accuracy: {train_acc}")
GNNs__practice.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
dataset_preparation.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import pandas as pd
3
+ from Bio.PDB import PDBParser
4
+ from Bio.SeqUtils import seq1
5
+ from Bio.PDB.Polypeptide import is_aa
6
+ from rdkit import Chem
7
+ from tqdm import tqdm
8
+
9
+ PDBBIND_PATH = "refined-set"
10
+ INDEX_NAME = "INDEX_refined_data.2020"
11
+
12
+
13
+ def get_ligand_smiles(pdb_id, pdb_dir_path):
14
+ """
15
+ Get the SMILES representation of the ligand.
16
+ """
17
+
18
+ sdf_path = os.path.join(pdb_dir_path, f"{pdb_id}_ligand.sdf")
19
+ mol2_path = os.path.join(pdb_dir_path, f"{pdb_id}_ligand.mol2")
20
+ if os.path.exists(sdf_path):
21
+ try:
22
+ sfd_file = Chem.SDMolSupplier(sdf_path)
23
+ if sfd_file:
24
+ mol = sfd_file[0]
25
+ except Exception:
26
+ mol = None
27
+
28
+ if mol is None and os.path.exists(mol2_path):
29
+ try:
30
+ mol = Chem.MolFromMol2File(mol2_path)
31
+ except Exception:
32
+ mol = None
33
+ if mol is not None:
34
+ smiles = Chem.MolToSmiles(mol)
35
+ return smiles
36
+ else:
37
+ return None
38
+
39
+
40
+ def get_protein_sequence(pdb_id, pdb_dir_path):
41
+ """
42
+ Get the protein sequence of the protein.
43
+ """
44
+ protein_path = os.path.join(pdb_dir_path, f"{pdb_id}_protein.pdb")
45
+ pdbparser = PDBParser()
46
+ structure = pdbparser.get_structure(pdb_id, protein_path)
47
+ sequences = []
48
+
49
+ for model in structure:
50
+ for chain in model:
51
+ sequence = ""
52
+ for residue in chain:
53
+ if residue.get_id()[0] == " " and is_aa(
54
+ residue.get_resname(), standard=True
55
+ ):
56
+ sequence += seq1(residue.get_resname())
57
+
58
+ sequences.append(sequence)
59
+ longest_sequence = max(sequences, key=len)
60
+ return longest_sequence
61
+
62
+
63
+ def main():
64
+ final_data = []
65
+
66
+ index_data = {}
67
+
68
+ index_file_path = os.path.join(PDBBIND_PATH, "index", INDEX_NAME)
69
+ with open(index_file_path, "r") as f:
70
+ for line in f:
71
+ if line.startswith("#"):
72
+ continue
73
+ parts = line.split()
74
+ pdb_id = parts[0]
75
+ print(pdb_id)
76
+ affinity = parts[3]
77
+
78
+ index_data[pdb_id] = affinity
79
+ print(f"Loaded index data for {len(index_data)} entries")
80
+
81
+ for pdb_id, affinity in tqdm(index_data.items()):
82
+ pdb_id_path = os.path.join(PDBBIND_PATH, pdb_id)
83
+
84
+ smiles = get_ligand_smiles(pdb_id, pdb_id_path)
85
+ sequence = get_protein_sequence(pdb_id, pdb_id_path)
86
+ if smiles is not None or sequence is not None:
87
+ final_data.append(
88
+ {
89
+ "pdb_id": pdb_id,
90
+ "smiles": smiles,
91
+ "sequence": sequence,
92
+ "affinity": affinity,
93
+ }
94
+ )
95
+
96
+ df = pd.DataFrame(final_data)
97
+ df.to_csv("pdbbind_refined_dataset.csv", index=False)
98
+
99
+
100
+ # pdb_id = "1a1e"
101
+ # PDF_ID_PATH = os.path.join(PDBBIND_PATH, pdb_id)
102
+ #
103
+ # smiles = get_ligand_smiles(pdb_id, PDF_ID_PATH)
104
+ # print(smiles)
105
+ #
106
+ # sequence = get_protein_sequence(pdb_id, PDF_ID_PATH)
107
+ # print(sequence)
108
+
109
+ if __name__ == "__main__":
110
+ main()
pdbbind_refined_dataset.csv ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ pandas
2
+ rdkit
3
+ biopython
4
+ torch
transformer_from_scratch/model.py ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+
6
+
7
+ class InputEmbeddings(nn.Module):
8
+ def __init__(self, d_model: int, vocab_size: int):
9
+ super().__init__()
10
+ self.d_model = d_model
11
+ self.vocab_size = vocab_size
12
+ self.embedding = nn.Embedding(vocab_size, d_model) # vocab_size -> 512
13
+
14
+ def forward(self, x):
15
+ return self.embedding(x) * math.sqrt(self.d_model)
16
+
17
+
18
+ class PositionalEncoding(nn.Module):
19
+ def __init__(self, d_model: int, seq_len: int, dropout: float):
20
+ super().__init__()
21
+ self.d_model = d_model
22
+ self.seq_len = seq_len
23
+ self.dropout = nn.Dropout(dropout)
24
+
25
+ # Create a matrix of shape (seq_len, d_model)
26
+ pe = torch.zeros(seq_len, d_model)
27
+
28
+ # Create a vector of shape (seq_len, 1)
29
+ position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(
30
+ 1
31
+ ) # (Seq_len, 1)
32
+ # Compute the positional encodings once in log space.
33
+ div_term = torch.exp(
34
+ torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
35
+ )
36
+ # Apply the sin to even positions
37
+ pe[:, 0::2] = torch.sin(position * div_term)
38
+ # Apply the cos to odd positions
39
+ pe[:, 1::2] = torch.cos(position * div_term)
40
+
41
+ pe = pe.unsqueeze(0) # (1, Seq_len, d_model) batch dimension
42
+ self.register_buffer("pe", pe)
43
+
44
+ def forward(self, x):
45
+ x = x + (self.pe[:, : x.shape[1], :]).requires_grad_(False)
46
+ return self.dropout(x)
47
+
48
+
49
+ class LayerNormalization(nn.Module):
50
+ def __init__(self, eps: float = 10e-6) -> None:
51
+ super().__init__()
52
+ self.eps = eps # avoid division by zero and huge numbers
53
+ self.alpha = nn.Parameter(torch.ones(1)) # Multiplied
54
+ self.bias = nn.Parameter(torch.zeros(1)) # Added
55
+
56
+ def forward(self, x):
57
+ mean = x.mean(dim=-1, keepdim=True) # To every sample
58
+ std = x.std(dim=-1, keepdim=True)
59
+ return self.alpha * (x - mean) / (std + self.eps) + self.bias
60
+
61
+
62
+ class FeedForwardBlock(nn.Module):
63
+ def __init__(self, d_model: int, d_ff: int, dropout: float):
64
+ super().__init__()
65
+ self.linear1 = nn.Linear(d_model, d_ff)
visualization.ipynb ADDED
@@ -0,0 +1,202 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "initial_id",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "data": {
11
+ "application/vnd.jupyter.widget-view+json": {
12
+ "model_id": "ccfa267dcd6945b6be10a9cbeffb4e5e",
13
+ "version_major": 2,
14
+ "version_minor": 0
15
+ },
16
+ "text/plain": []
17
+ },
18
+ "metadata": {},
19
+ "output_type": "display_data"
20
+ }
21
+ ],
22
+ "source": [
23
+ "import nglview as nv\n",
24
+ "import os"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 2,
30
+ "id": "d8d7978e-980a-400c-8c6a-5365990c8855",
31
+ "metadata": {},
32
+ "outputs": [],
33
+ "source": [
34
+ "PDBBIND_PATH = \"refined-set\""
35
+ ]
36
+ },
37
+ {
38
+ "cell_type": "code",
39
+ "execution_count": 3,
40
+ "id": "788a6b43-c515-45c7-bc52-341d446b1a65",
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "EXAMPLE_PDB_ID = \"1a1e\""
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 4,
50
+ "id": "e8f4bebc-845f-43e8-bc4d-ab7b649eb49c",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "pdb_dir = os.path.join(PDBBIND_PATH, EXAMPLE_PDB_ID)"
55
+ ]
56
+ },
57
+ {
58
+ "cell_type": "code",
59
+ "execution_count": 5,
60
+ "id": "24b5e435-4d8f-4505-b27c-dd6317376ed4",
61
+ "metadata": {},
62
+ "outputs": [],
63
+ "source": [
64
+ "protein_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_protein.pdb\")"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 6,
70
+ "id": "e7fc3539-00c0-48a2-b012-c80757fa12c4",
71
+ "metadata": {},
72
+ "outputs": [],
73
+ "source": [
74
+ "ligand_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_ligand.sdf\")"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 7,
80
+ "id": "9a053b99-7c01-4881-b3f7-e9b39090af9d",
81
+ "metadata": {},
82
+ "outputs": [],
83
+ "source": [
84
+ "view = nv.NGLWidget()"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 8,
90
+ "id": "df8c8e00-3ce6-41dd-b457-d9f50e318dad",
91
+ "metadata": {},
92
+ "outputs": [],
93
+ "source": [
94
+ "protein_comp = view.add_component(protein_file)"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 9,
100
+ "id": "c191fead-fef8-4077-b787-5bf9552307b1",
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "protein_comp.clear_representations()"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 10,
110
+ "id": "4559033a-aeda-4659-8d91-9002b5a6ecda",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "protein_comp.add_representation('cartoon', color='blue')"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 11,
120
+ "id": "73ea1a50-8463-40b8-a942-0c92d3e97a97",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "ligand_comp = view.add_component(ligand_file)"
125
+ ]
126
+ },
127
+ {
128
+ "cell_type": "code",
129
+ "execution_count": 12,
130
+ "id": "16cdb710-1ed6-4b1d-9e6a-69b7ad61a600",
131
+ "metadata": {},
132
+ "outputs": [],
133
+ "source": [
134
+ "ligand_comp.clear_representations()"
135
+ ]
136
+ },
137
+ {
138
+ "cell_type": "code",
139
+ "execution_count": 13,
140
+ "id": "2193c497-f33c-4de0-86a9-6e535002fcb7",
141
+ "metadata": {},
142
+ "outputs": [],
143
+ "source": [
144
+ "ligand_comp.add_representation('ball+stick', radius=0.3)"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 14,
150
+ "id": "b1cc7f44-a374-4400-b4ba-8f75101b21ce",
151
+ "metadata": {},
152
+ "outputs": [
153
+ {
154
+ "data": {
155
+ "application/vnd.jupyter.widget-view+json": {
156
+ "model_id": "6037e0edee3247a49cd586e52e64a61b",
157
+ "version_major": 2,
158
+ "version_minor": 0
159
+ },
160
+ "text/plain": [
161
+ "NGLWidget()"
162
+ ]
163
+ },
164
+ "metadata": {},
165
+ "output_type": "display_data"
166
+ }
167
+ ],
168
+ "source": [
169
+ "view"
170
+ ]
171
+ },
172
+ {
173
+ "cell_type": "code",
174
+ "execution_count": null,
175
+ "id": "5655e465-bb44-4218-a5e3-db2c5e62cd9c",
176
+ "metadata": {},
177
+ "outputs": [],
178
+ "source": []
179
+ }
180
+ ],
181
+ "metadata": {
182
+ "kernelspec": {
183
+ "display_name": "Python 3 (ipykernel)",
184
+ "language": "python",
185
+ "name": "python3"
186
+ },
187
+ "language_info": {
188
+ "codemirror_mode": {
189
+ "name": "ipython",
190
+ "version": 3
191
+ },
192
+ "file_extension": ".py",
193
+ "mimetype": "text/x-python",
194
+ "name": "python",
195
+ "nbconvert_exporter": "python",
196
+ "pygments_lexer": "ipython3",
197
+ "version": "3.12.4"
198
+ }
199
+ },
200
+ "nbformat": 4,
201
+ "nbformat_minor": 5
202
+ }