Spaces:

AlexSychovUN
/

BindingAffinityPrediction

Sleeping

App Files Files Community

AlexSychovUN committed on Dec 14, 2025

Commit

7e792a6

1 Parent(s): 229e134

Added files

Browse files

Files changed (4) hide show

EDA.ipynb +340 -0
dataset.py +167 -0
model.py +116 -0
train.py +70 -0

EDA.ipynb ADDED Viewed

	@@ -0,0 +1,340 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true,
+    "ExecuteTime": {
+     "end_time": "2025-12-14T17:07:51.424473Z",
+     "start_time": "2025-12-14T17:07:50.941145Z"
+    }
+   },
+   "source": "import pandas as pd",
+   "outputs": [],
+   "execution_count": 1
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-14T17:07:51.576127Z",
+     "start_time": "2025-12-14T17:07:51.545229Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "dataset = pd.read_csv(\"pdbbind_refined_dataset.csv\")",
+   "id": "5e25fb1118050711",
+   "outputs": [],
+   "execution_count": 2
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-14T17:07:55.548660Z",
+     "start_time": "2025-12-14T17:07:55.528324Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "dataset",
+   "id": "f1f5daab7e2df86d",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "     pdb_id                                             smiles  \\\n",
+       "0      2r58                   C[NH+](C)CCCC[C@H]([NH3+])C(=O)O   \n",
+       "1      3c2f  O=P(O)(O)OC[C@H]1O[C@H](O[P@](=O)(O)OP(=O)(O)O...   \n",
+       "2      3g2y                     CCc1c(C)[nH]n(-c2nnn[nH]2)c1=O   \n",
+       "3      3pce                                 O=C(O)Cc1cccc(O)c1   \n",
+       "4      4qsu                               C[C@H]1C=NC(=O)NC1=O   \n",
+       "...     ...                                                ...   \n",
+       "5311   4f3c  CCCCSC[C@H]1C[N@@H+](Cc2c[nH]c3c(N)ncnc23)C[C@...   \n",
+       "5312   5bry  C[NH2+][C@H]1CO[C@@H]2OC[C@H](OC(=O)N[C@@H](Cc...   \n",
+       "5313   1sl3  O=C(Cn1c(Cl)cnc(NCC(F)(F)c2cccc[n+]2[O-])c1=O)...   \n",
+       "5314   1ctu  O=C1N[C@H](O)C=CN1[C@@H]1O[C@H](CO)[C@@H](O)[C...   \n",
+       "5315   6e9a  COc1ccc(S(=O)(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc2c...   \n",
+       "\n",
+       "                                               sequence  affinity  \n",
+       "0     AFDWDAYLEETGSEAAPAKCFKQAQNPPNNDFKIGMKLEALDPRNV...      2.00  \n",
+       "1     PVYEHLLPVNGAWRQDVTNWLSEDVPSFDFGGYVVGSDLKEANLYC...      2.00  \n",
+       "2     TSAVQQKLAALEKSSGGRLGVALIDTADNTQVLYRGDERFPMCSTS...      2.00  \n",
+       "3     PAQDNSRFVIRDRNWHPKALTPDYKTSIARSPRQALVSIPQSISET...      2.00  \n",
+       "4     SMQEEDTFRELRIFLRNVTHRLAIDKRFRVFTKPVDPDEVPDYVTV...      2.00  \n",
+       "...                                                 ...       ...  \n",
+       "5311  MKIGIIGAMEEEVTLLRDKIDNRQTITLGGCEIYTGQLNGTEVALL...     11.82  \n",
+       "5312  PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKM...     11.82  \n",
+       "5313  DCGLRPLFEKKSLEDKTERELLESYIIVEGSDAEIGMSPWQVMLFR...     11.85  \n",
+       "5314  MHPRFQTAFAQLADNLQSALEPILADKYFPALLTGEQVSSLKSATG...     11.92  \n",
+       "5315  PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKM...     11.92  \n",
+       "\n",
+       "[5316 rows x 4 columns]"
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>pdb_id</th>\n",
+       "      <th>smiles</th>\n",
+       "      <th>sequence</th>\n",
+       "      <th>affinity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>2r58</td>\n",
+       "      <td>C[NH+](C)CCCC[C@H]([NH3+])C(=O)O</td>\n",
+       "      <td>AFDWDAYLEETGSEAAPAKCFKQAQNPPNNDFKIGMKLEALDPRNV...</td>\n",
+       "      <td>2.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>3c2f</td>\n",
+       "      <td>O=P(O)(O)OC[C@H]1O[C@H](O[P@](=O)(O)OP(=O)(O)O...</td>\n",
+       "      <td>PVYEHLLPVNGAWRQDVTNWLSEDVPSFDFGGYVVGSDLKEANLYC...</td>\n",
+       "      <td>2.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>3g2y</td>\n",
+       "      <td>CCc1c(C)[nH]n(-c2nnn[nH]2)c1=O</td>\n",
+       "      <td>TSAVQQKLAALEKSSGGRLGVALIDTADNTQVLYRGDERFPMCSTS...</td>\n",
+       "      <td>2.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>3pce</td>\n",
+       "      <td>O=C(O)Cc1cccc(O)c1</td>\n",
+       "      <td>PAQDNSRFVIRDRNWHPKALTPDYKTSIARSPRQALVSIPQSISET...</td>\n",
+       "      <td>2.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>4qsu</td>\n",
+       "      <td>C[C@H]1C=NC(=O)NC1=O</td>\n",
+       "      <td>SMQEEDTFRELRIFLRNVTHRLAIDKRFRVFTKPVDPDEVPDYVTV...</td>\n",
+       "      <td>2.00</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5311</th>\n",
+       "      <td>4f3c</td>\n",
+       "      <td>CCCCSC[C@H]1C[N@@H+](Cc2c[nH]c3c(N)ncnc23)C[C@...</td>\n",
+       "      <td>MKIGIIGAMEEEVTLLRDKIDNRQTITLGGCEIYTGQLNGTEVALL...</td>\n",
+       "      <td>11.82</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5312</th>\n",
+       "      <td>5bry</td>\n",
+       "      <td>C[NH2+][C@H]1CO[C@@H]2OC[C@H](OC(=O)N[C@@H](Cc...</td>\n",
+       "      <td>PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKM...</td>\n",
+       "      <td>11.82</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5313</th>\n",
+       "      <td>1sl3</td>\n",
+       "      <td>O=C(Cn1c(Cl)cnc(NCC(F)(F)c2cccc[n+]2[O-])c1=O)...</td>\n",
+       "      <td>DCGLRPLFEKKSLEDKTERELLESYIIVEGSDAEIGMSPWQVMLFR...</td>\n",
+       "      <td>11.85</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5314</th>\n",
+       "      <td>1ctu</td>\n",
+       "      <td>O=C1N[C@H](O)C=CN1[C@@H]1O[C@H](CO)[C@@H](O)[C...</td>\n",
+       "      <td>MHPRFQTAFAQLADNLQSALEPILADKYFPALLTGEQVSSLKSATG...</td>\n",
+       "      <td>11.92</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>5315</th>\n",
+       "      <td>6e9a</td>\n",
+       "      <td>COc1ccc(S(=O)(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc2c...</td>\n",
+       "      <td>PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKM...</td>\n",
+       "      <td>11.92</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5316 rows × 4 columns</p>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 3,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 3
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-14T17:08:03.034505Z",
+     "start_time": "2025-12-14T17:08:03.021615Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "dataset.describe()",
+   "id": "3607df642b3b7c29",
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "          affinity\n",
+       "count  5316.000000\n",
+       "mean      6.392466\n",
+       "std       1.952730\n",
+       "min       2.000000\n",
+       "25%       4.930000\n",
+       "50%       6.420000\n",
+       "75%       7.740000\n",
+       "max      11.920000"
+      ],
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>affinity</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>count</th>\n",
+       "      <td>5316.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>mean</th>\n",
+       "      <td>6.392466</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>std</th>\n",
+       "      <td>1.952730</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>min</th>\n",
+       "      <td>2.000000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>25%</th>\n",
+       "      <td>4.930000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>50%</th>\n",
+       "      <td>6.420000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>75%</th>\n",
+       "      <td>7.740000</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>max</th>\n",
+       "      <td>11.920000</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "execution_count": 4
+  },
+  {
+   "metadata": {
+    "ExecuteTime": {
+     "end_time": "2025-12-14T17:08:23.940133Z",
+     "start_time": "2025-12-14T17:08:23.925955Z"
+    }
+   },
+   "cell_type": "code",
+   "source": "dataset.info()",
+   "id": "c747950337bfdc2b",
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "<class 'pandas.core.frame.DataFrame'>\n",
+      "RangeIndex: 5316 entries, 0 to 5315\n",
+      "Data columns (total 4 columns):\n",
+      " #   Column    Non-Null Count  Dtype  \n",
+      "---  ------    --------------  -----  \n",
+      " 0   pdb_id    5316 non-null   object \n",
+      " 1   smiles    5314 non-null   object \n",
+      " 2   sequence  5316 non-null   object \n",
+      " 3   affinity  5316 non-null   float64\n",
+      "dtypes: float64(1), object(3)\n",
+      "memory usage: 166.3+ KB\n"
+     ]
+    }
+   ],
+   "execution_count": 5
+  },
+  {
+   "metadata": {},
+   "cell_type": "code",
+   "outputs": [],
+   "execution_count": null,
+   "source": "",
+   "id": "28375d839fb776f6"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

dataset.py ADDED Viewed

	@@ -0,0 +1,167 @@

+import numpy as np
+import torch
+import pandas as pd
+from rdkit import Chem, rdBase
+from torch_geometric.data import Data
+from torch.utils.data import Dataset, random_split
+rdBase.DisableLog("rdApp.*")
def one_of_k_encoding(x, allowable_set):
    """One-hot encode ``x`` against ``allowable_set``.

    A value that does not occur in ``allowable_set`` is replaced by the last
    entry of the set, which therefore acts as the "unknown" bucket.

    Args:
        x: Value to encode.
        allowable_set: Non-empty sequence of permitted values. Indexing with
            ``[-1]`` must be supported; an empty sequence raises
            ``IndexError`` when ``x`` is not found.

    Returns:
        A list with one entry per element of ``allowable_set``; each entry is
        the result of comparing ``x`` to that element with ``==``.
    """
    if x not in allowable_set:
        # Fall back to the last slot, reserved for unknown values.
        x = allowable_set[-1]
    return [x == candidate for candidate in allowable_set]
# Element symbols recognised by the atom featurizer. The trailing "Unknown"
# entry is the bucket used for any element that is not listed.
_ATOM_SYMBOLS = (
    "C",
    "N",
    "O",
    "S",
    "F",
    "Si",
    "P",
    "Cl",
    "Br",
    "Mg",
    "Na",
    "Ca",
    "Fe",
    "As",
    "Al",
    "I",
    "B",
    "V",
    "K",
    "Tl",
    "Yb",
    "Sb",
    "Sn",
    "Ag",
    "Pd",
    "Co",
    "Se",
    "Ti",
    "Zn",
    "H",
    "Li",
    "Ge",
    "Cu",
    "Au",
    "Ni",
    "Cd",
    "In",
    "Mn",
    "Zr",
    "Cr",
    "Pt",
    "Hg",
    "Pb",
    "Unknown",
)
# Shared 0..10 vocabulary for degree, hydrogen count and implicit valence.
# Values outside this range are mapped onto the last slot (10).
_ATOM_COUNT_BINS = tuple(range(11))


def get_atom_features(atom):
    """Build the feature vector of a single RDKit atom.

    The vector is the concatenation of one-hot encodings (in this order) of
    the element symbol, the degree, the total hydrogen count, the implicit
    valence and the hybridization, followed by a single aromaticity flag.

    Args:
        atom: An RDKit atom exposing ``GetSymbol``, ``GetDegree``,
            ``GetTotalNumHs``, ``GetImplicitValence``, ``GetHybridization``
            and ``GetIsAromatic``.

    Returns:
        A 1-D ``numpy`` array built from the concatenated boolean flags.
    """
    # Hybridization matters for 3D shape (sp2: trigonal planar, sp3:
    # tetrahedral). "other" is the fallback bucket for anything else.
    hybridization_types = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
        Chem.rdchem.HybridizationType.SP3D,
        Chem.rdchem.HybridizationType.SP3D2,
        "other",
    ]
    features = (
        # Type of atom (symbol)
        one_of_k_encoding(atom.GetSymbol(), _ATOM_SYMBOLS)
        # Number of neighbours (degree)
        + one_of_k_encoding(atom.GetDegree(), _ATOM_COUNT_BINS)
        # Total number of attached hydrogens, as reported by GetTotalNumHs()
        + one_of_k_encoding(atom.GetTotalNumHs(), _ATOM_COUNT_BINS)
        # Implicit valence
        + one_of_k_encoding(atom.GetImplicitValence(), _ATOM_COUNT_BINS)
        # Hybridization state
        + one_of_k_encoding(atom.GetHybridization(), hybridization_types)
        # Aromaticity (boolean)
        + [atom.GetIsAromatic()]
    )
    return np.array(features)
# Residue-letter vocabulary. Id 0 is reserved for padding; the ambiguous codes
# X, Z and B share id 21 with the generic "UNK" entry. Built once at import
# time instead of on every lookup.
_PROTEIN_VOCAB = {
    'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5, 'Q': 6, 'E': 7, 'G': 8, 'H': 9,
    'I': 10, 'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15, 'S': 16, 'T': 17,
    'W': 18, 'Y': 19, 'V': 20, 'X': 21, 'Z': 21, 'B': 21,
    'PAD': 0, 'UNK': 21,
}


def get_protein_features(char):
    """Return the integer token id for a residue letter.

    Args:
        char: A one-letter amino-acid code, or one of the special keys
            ``"PAD"`` / ``"UNK"``. Lookup is case-sensitive.

    Returns:
        The token id from the vocabulary, or the ``"UNK"`` id (21) for any
        key that is not present.
    """
    return _PROTEIN_VOCAB.get(char, _PROTEIN_VOCAB['UNK'])
class BindingDataset(Dataset):
    """Dataset of ligand graphs paired with tokenised protein sequences.

    Each row of the wrapped dataframe must provide the columns ``smiles``,
    ``sequence`` and ``affinity``.

    Args:
        dataframe: Table holding one protein-ligand complex per row.
        max_seq_length: Fixed length every protein token sequence is
            truncated or padded to.
    """

    def __init__(self, dataframe, max_seq_length=1000):
        self.data = dataframe
        self.max_seq_length = max_seq_length

    def __len__(self):
        """Return the number of rows in the wrapped dataframe."""
        return len(self.data)

    def __getitem__(self, idx):
        """Featurise the row at positional index ``idx``.

        Returns:
            A ``Data`` object with node features ``x``, ``edge_index``, the
            padded token tensor ``protein_seq`` and the target ``y``; or
            ``None`` when the SMILES is missing or cannot be parsed.
            NOTE(review): consumers that batch items need to filter out
            ``None`` themselves — confirm against the data loaders in use.
        """
        row = self.data.iloc[idx]
        smiles = row["smiles"]
        sequence = row["sequence"]
        affinity = row["affinity"]

        # A missing CSV cell arrives as float NaN rather than a string; treat
        # it like an unparseable SMILES instead of handing it to RDKit.
        if not isinstance(smiles, str):
            return None
        mol = Chem.MolFromSmiles(smiles)
        if mol is None:
            return None

        # Ligand graph: one feature row per atom.
        atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
        x = torch.tensor(np.array(atom_features), dtype=torch.float)

        # Every bond is stored in both directions (undirected graph).
        edge_pairs = []
        for bond in mol.GetBonds():
            begin = bond.GetBeginAtomIdx()
            end = bond.GetEndAtomIdx()
            edge_pairs.append((begin, end))
            edge_pairs.append((end, begin))
        if edge_pairs:
            # [num_edges, 2] -> [2, num_edges]; contiguous() materialises the
            # transposed view into its own sequential memory layout.
            edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
        else:
            # A ligand without bonds (e.g. a single atom or ion) must still
            # yield a [2, 0] index; transposing an empty 1-D tensor would
            # leave it with shape [0].
            edge_index = torch.empty((2, 0), dtype=torch.long)

        # Protein: integer tokens, truncated then right-padded to fixed length.
        tokens = [get_protein_features(char) for char in sequence[: self.max_seq_length]]
        pad_count = self.max_seq_length - len(tokens)
        tokens.extend([get_protein_features("PAD")] * pad_count)
        protein_tensor = torch.tensor(tokens, dtype=torch.long)

        # Regression target.
        y = torch.tensor([affinity], dtype=torch.float)
        return Data(x=x, edge_index=edge_index, protein_seq=protein_tensor, y=y)
if __name__ == "__main__":
    # Smoke test: build the dataset and print the sizes of an 80/20 split.
    frame = pd.read_csv("pdbbind_refined_dataset.csv")
    # Drop incomplete rows first, as the training script does, so the sizes
    # printed here match what training actually sees.
    frame = frame.dropna()
    dataset = BindingDataset(frame)
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
    print(len(train_dataset))
    print(len(test_dataset))

model.py ADDED Viewed

	@@ -0,0 +1,116 @@

+import math
+import torch
+import torch.nn as nn
+from torch_geometric.nn import GCNConv, global_mean_pool
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position encodings to a batch of embeddings.

    Args:
        d_model: Embedding width. Both even and odd widths are supported.
        seq_len: Largest sequence length the precomputed table covers.
        dropout: Dropout probability applied after the encodings are added.
    """

    def __init__(self, d_model: int, seq_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Positions as a column vector: (seq_len, 1).
        position = torch.arange(0, seq_len, dtype=torch.float).unsqueeze(1)
        # Frequencies computed in log space, one per even column index.
        div_term = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )
        angles = position * div_term  # (seq_len, ceil(d_model / 2))

        pe = torch.zeros(seq_len, d_model)
        # Sine on even columns.
        pe[:, 0::2] = torch.sin(angles)
        # Cosine on odd columns. With an odd d_model there is one odd column
        # fewer than even ones, so the surplus frequency is dropped to keep
        # the shapes aligned.
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Stored as a buffer with a leading batch dimension: it follows the
        # module across devices and is saved with it, but is not a parameter.
        self.register_buffer("pe", pe.unsqueeze(0))  # (1, seq_len, d_model)

    def forward(self, x):
        """Add the encodings for the first ``x.shape[1]`` positions.

        Args:
            x: Tensor of shape ``[batch_size, seq_len, d_model]``.

        Returns:
            A tensor of the same shape as ``x`` after dropout.
        """
        return self.dropout(x + self.pe[:, : x.shape[1], :])
class LigandGNN(nn.Module):
    """Graph encoder that turns a batch of ligand graphs into one vector each.

    Three ``GCNConv`` layers are applied in sequence and the resulting node
    embeddings are mean-pooled per graph.

    Args:
        input_dim: Number of features per node.
        hidden_channels: Width of every graph-convolution layer and of the
            pooled output vector.
    """

    def __init__(self, input_dim, hidden_channels):
        super().__init__()
        self.hidden_channels = hidden_channels
        self.conv1 = GCNConv(input_dim, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, hidden_channels)
        self.conv3 = GCNConv(hidden_channels, hidden_channels)
        self.dropout = nn.Dropout(0.2)

    def forward(self, x, edge_index, batch):
        """Encode node features and pool them per graph.

        Args:
            x: Node feature matrix.
            edge_index: Edge list of the batched graphs.
            batch: Graph assignment index of every node.

        Returns:
            Tensor of shape ``[batch_size, hidden_channels]``.
        """
        # Layer 1: convolution, ReLU, dropout.
        hidden = self.dropout(self.conv1(x, edge_index).relu())
        # Layer 2: convolution and ReLU only.
        hidden = self.conv2(hidden, edge_index).relu()
        # Layer 3: convolution followed by dropout, no activation.
        hidden = self.dropout(self.conv3(hidden, edge_index))
        # Average the node embeddings of each graph into one molecule vector.
        return global_mean_pool(hidden, batch)
class ProteinTransformer(nn.Module):
    """Transformer encoder that maps a token sequence to a single vector.

    Token id 0 is treated as padding: it is excluded from attention and from
    the mean pooling.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Embedding and transformer width.
        N: Number of encoder layers.
        h: Number of attention heads.
        output_dim: Width of the returned vector.
    """

    def __init__(self, vocab_size, d_model=128, N=2, h=4, output_dim=128):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model)
        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=h, batch_first=True)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N)
        self.fc = nn.Linear(d_model, output_dim)

    def forward(self, x):
        """Encode a batch of token sequences.

        Args:
            x: Integer tensor of shape ``[batch_size, seq_len]``.

        Returns:
            Tensor of shape ``[batch_size, output_dim]``.
        """
        padding_mask = x == 0  # True at PAD positions
        hidden = self.embedding(x) * math.sqrt(self.d_model)
        hidden = self.pos_encoder(hidden)
        hidden = self.transformer(hidden, src_key_padding_mask=padding_mask)

        # Zero the padded positions with masked_fill rather than multiplying
        # by a 0/1 mask: 0 * NaN is still NaN, so a non-finite value at a
        # padded position (possible e.g. when a row is entirely padding)
        # would otherwise leak into the pooled sum.
        hidden = hidden.masked_fill(padding_mask.unsqueeze(-1), 0.0)

        # Mean over real tokens only; the clamp guards all-padding rows
        # against division by zero.
        token_counts = (~padding_mask).sum(dim=1, keepdim=True).clamp(min=1)
        pooled = hidden.sum(dim=1) / token_counts
        return self.fc(pooled)
class BindingAffinityModel(nn.Module):
    """Two-tower regressor that predicts affinity from a ligand and a protein.

    A graph network encodes the ligand, a transformer encodes the protein
    sequence, and an MLP head maps the concatenated vectors to one value.

    Args:
        num_node_features: Number of features per ligand atom.
        hidden_channels_gnn: Width of the ligand encoder and of its output.
    """

    def __init__(self, num_node_features, hidden_channels_gnn):
        super().__init__()
        protein_dim = 128  # width of the protein tower's output vector

        # Tower 1 - ligand GNN
        self.ligand_gnn = LigandGNN(input_dim=num_node_features, hidden_channels=hidden_channels_gnn)
        # Tower 2 - protein transformer. vocab_size is kept at its existing
        # value so the embedding table shape is unchanged.
        self.protein_transformer = ProteinTransformer(vocab_size=26, output_dim=protein_dim)

        # The head input width follows the two tower widths, so a GNN width
        # other than 128 no longer breaks the first linear layer.
        self.head = nn.Sequential(
            nn.Linear(hidden_channels_gnn + protein_dim, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 1),
        )

    def forward(self, x, edge_index, batch, protein_seq):
        """Predict one affinity value per graph in the batch.

        Args:
            x: Node feature matrix of the batched ligand graphs.
            edge_index: Edge list of the batched ligand graphs.
            batch: Graph assignment index of every node.
            protein_seq: Token ids of all proteins in the batch; reshaped
                here to one row per graph.

        Returns:
            Tensor of shape ``[batch_size, 1]``.
        """
        ligand_vec = self.ligand_gnn(x, edge_index, batch)
        # The pooled ligand tensor already has one row per graph, so its
        # first dimension gives the batch size without a host sync via
        # ``batch.max().item()``.
        protein_seq = protein_seq.view(ligand_vec.size(0), -1)
        protein_vec = self.protein_transformer(protein_seq)
        combined = torch.cat([ligand_vec, protein_vec], dim=1)
        return self.head(combined)

train.py ADDED Viewed

	@@ -0,0 +1,70 @@

+import torch
+import torch.nn as nn
+import pandas as pd
+from torch.utils.data import random_split
+from torch_geometric.loader import DataLoader
+from dataset import BindingDataset
+from model import BindingAffinityModel
+from tqdm import tqdm
+DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
def train_epoch(model, loader, optimizer, criterion):
    """Run one optimisation pass over ``loader``.

    Args:
        model: Network called as ``model(x, edge_index, batch, protein_seq)``.
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``,
            ``protein_seq`` and ``y``.
        optimizer: Optimizer updating the parameters of ``model``.
        criterion: Loss returning the mean over the elements of a batch.

    Returns:
        The loss averaged over all samples seen, or NaN when the loader
        yields no samples.
    """
    model.train()
    loss_sum = 0.0
    sample_count = 0
    for batch in tqdm(loader, desc="Training"):
        batch = batch.to(DEVICE)
        optimizer.zero_grad()
        out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
        # Flatten both sides to 1-D so predictions and targets line up
        # element by element, whatever the batch size.
        predictions = out.reshape(-1)
        targets = batch.y.reshape(-1)
        loss = criterion(predictions, targets)
        loss.backward()
        optimizer.step()
        # Weight each batch mean by its size; a plain mean of batch means
        # over-weights a short final batch.
        loss_sum += loss.item() * targets.numel()
        sample_count += targets.numel()
    return loss_sum / sample_count if sample_count else float("nan")
def evaluate(model, loader, criterion):
    """Compute the average loss of ``model`` over ``loader`` without training.

    Args:
        model: Network called as ``model(x, edge_index, batch, protein_seq)``.
        loader: Iterable of batches exposing ``x``, ``edge_index``, ``batch``,
            ``protein_seq`` and ``y``.
        criterion: Loss returning the mean over the elements of a batch.

    Returns:
        The loss averaged over all samples seen, or NaN when the loader
        yields no samples.
    """
    model.eval()
    loss_sum = 0.0
    sample_count = 0
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(DEVICE)
            out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
            predictions = out.reshape(-1)
            targets = batch.y.reshape(-1)
            loss = criterion(predictions, targets)
            # Weight each batch mean by its size so a short final batch does
            # not skew the reported average.
            loss_sum += loss.item() * targets.numel()
            sample_count += targets.numel()
    return loss_sum / sample_count if sample_count else float("nan")
def main():
    """Train the binding-affinity model and save its weights.

    Reads ``pdbbind_refined_dataset.csv``, drops incomplete rows, trains for
    a fixed number of epochs on an 80/20 split and writes the final
    ``state_dict`` to ``./model.pth``.
    """
    # Load dataset
    dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
    dataframe = dataframe.dropna()
    print(f"Dataset loaded with {len(dataframe)} samples")

    dataset = BindingDataset(dataframe)
    print(f"Dataset transformed with {len(dataset)} samples")

    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    # Seed the split so the held-out set is the same on every run and test
    # losses stay comparable between runs.
    split_generator = torch.Generator().manual_seed(42)
    train_dataset, test_dataset = random_split(
        dataset, [train_size, test_size], generator=split_generator
    )

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    # Infer the node feature width from the first training sample.
    num_features = train_dataset[0].x.shape[1]
    print("Number of node features:", num_features)

    model = BindingAffinityModel(num_node_features=num_features, hidden_channels_gnn=128).to(DEVICE)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
    criterion = nn.MSELoss()

    num_epochs = 20
    for epoch in range(num_epochs):
        train_loss = train_epoch(model, train_loader, optimizer, criterion)
        test_loss = evaluate(model, test_loader, criterion)
        print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')

    torch.save(model.state_dict(), './model.pth')


if __name__ == "__main__":
    main()