AlexSychovUN commited on
Commit
2fdd454
·
1 Parent(s): 6afa7ea

Updated code

Browse files
inference.py ADDED
@@ -0,0 +1,85 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import random
2
+
3
+ import torch
4
+ import pandas as pd
5
+ import matplotlib.pyplot as plt
6
+ import numpy as np
7
+ from torch_geometric.loader import DataLoader
8
+ from dataset import BindingDataset
9
+ from model import BindingAffinityModel
10
+ from tqdm import tqdm
11
+ from scipy.stats import pearsonr
12
+ from torch.utils.data import random_split
13
+
14
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
15
+ MODEL_PATH = "best_model_gat.pth"
16
+
17
def set_seed(seed=42):
    """Seed every RNG in play (python, torch CPU/CUDA, numpy) for reproducibility.

    Returns a freshly seeded ``torch.Generator`` suitable for passing to
    ``random_split`` so the data split itself is deterministic.
    """
    for seeder in (random.seed, torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
        seeder(seed)
    return torch.Generator().manual_seed(seed)
23
+
24
def predict_and_plot():
    """Evaluate the trained GAT model on the held-out test split and plot results.

    Re-creates the same 80/20 split used during training (identically seeded
    generator), runs inference on the test set, reports RMSE / MAE / Pearson
    correlation, and saves a true-vs-predicted scatter plot.
    """
    gen = set_seed(42)
    print("Loading data...")

    dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
    dataframe.dropna(inplace=True)
    dataset = BindingDataset(dataframe)
    if len(dataset) == 0:
        print("Dataset is empty")
        return

    # Same seed -> random_split reproduces the exact test indices from training.
    train_size = int(0.8 * len(dataset))
    test_size = len(dataset) - train_size
    _, test_dataset = random_split(dataset, [train_size, test_size], generator=gen)

    loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
    num_features = test_dataset[0].x.shape[1]

    print("Loading model...")
    # NOTE: the updated BindingAffinityModel takes `hidden_channels`; the old
    # `hidden_channels_gnn` keyword no longer exists and raised TypeError.
    model = BindingAffinityModel(num_node_features=num_features, hidden_channels=128).to(DEVICE)
    # map_location lets a GPU-trained checkpoint load on a CPU-only host.
    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
    model.eval()

    y_true = []
    y_pred = []
    print("Predicting...")
    with torch.no_grad():
        for batch in tqdm(loader):
            batch = batch.to(DEVICE)
            out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)

            y_true.extend(batch.y.cpu().numpy())
            # view(-1) (not squeeze()) keeps a 1-D result even when the last
            # batch has a single graph; squeeze() would produce a 0-d scalar
            # and break extend().
            y_pred.extend(out.view(-1).cpu().numpy())
    y_true = np.array(y_true)
    y_pred = np.array(y_pred)

    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
    mae = np.mean(np.abs(y_true - y_pred))
    pearson_corr, _ = pearsonr(y_true, y_pred)  # Pearson correlation

    print("Results:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Pearson Correlation: {pearson_corr:.4f}")

    plt.figure(figsize=(9, 9))
    plt.scatter(y_true, y_pred, alpha=0.4, s=15, c='blue', label='Predictions')
    # Identity line: a perfect predictor would place every point on it.
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='red', linestyle='--', linewidth=2,
             label='Ideal')

    plt.xlabel('Experimental Affinity (pK)')
    plt.ylabel('Predicted Affinity (pK)')
    plt.title(f'Binding affinity Results\nRMSE={rmse:.3f}, Pearson R={pearson_corr:.3f}')
    plt.legend()
    plt.grid(True, alpha=0.3)
    plot_file = 'final_results_gat.png'
    plt.savefig(plot_file)
    print(f"График сохранен в {plot_file}")
    plt.show()

if __name__ == "__main__":
    predict_and_plot()
model.py CHANGED
@@ -4,7 +4,7 @@ import torch
4
  import torch.nn as nn
5
 
6
 
7
- from torch_geometric.nn import GCNConv, global_mean_pool
8
 
9
  class PositionalEncoding(nn.Module):
10
  def __init__(self, d_model: int, seq_len: int = 5000, dropout: float = 0.1):
@@ -39,15 +39,40 @@ class PositionalEncoding(nn.Module):
39
 
40
 
41
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42
  class LigandGNN(nn.Module):
43
- def __init__(self, input_dim, hidden_channels):
44
  super().__init__()
45
- self.hidden_channels = hidden_channels
46
-
47
- self.conv1 = GCNConv(input_dim, hidden_channels)
48
- self.conv2 = GCNConv(hidden_channels, hidden_channels)
49
- self.conv3 = GCNConv(hidden_channels, hidden_channels)
50
- self.dropout = nn.Dropout(0.2)
51
 
52
  def forward(self, x, edge_index, batch):
53
  x = self.conv1(x, edge_index)
@@ -56,19 +81,20 @@ class LigandGNN(nn.Module):
56
 
57
  x = self.conv2(x, edge_index)
58
  x = x.relu()
59
- x = self.conv3(x, edge_index)
60
  x = self.dropout(x)
61
 
62
- # Averaging nodes and got the molecula vector
63
- x = global_mean_pool(x, batch) # [batch_size, hidden_channels]
 
 
64
  return x
65
 
66
  class ProteinTransformer(nn.Module):
67
- def __init__(self, vocab_size, d_model=128, N=2, h=4, output_dim=128):
68
  super().__init__()
69
  self.d_model = d_model
70
  self.embedding = nn.Embedding(vocab_size, d_model)
71
- self.pos_encoder = PositionalEncoding(d_model)
72
  encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=h, batch_first=True)
73
  self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N)
74
 
@@ -91,18 +117,18 @@ class ProteinTransformer(nn.Module):
91
  return x
92
 
93
  class BindingAffinityModel(nn.Module):
94
- def __init__(self, num_node_features, hidden_channels_gnn):
95
  super().__init__()
96
  # Tower 1 - Ligand GNN
97
- self.ligand_gnn = LigandGNN(input_dim=num_node_features, hidden_channels=hidden_channels_gnn)
98
  # Tower 2 - Protein Transformer
99
- self.protein_transformer = ProteinTransformer(vocab_size=26)
100
 
101
  self.head = nn.Sequential(
102
- nn.Linear(128 + 128, 256),
103
  nn.ReLU(),
104
- nn.Dropout(0.2),
105
- nn.Linear(256, 1),
106
  )
107
  def forward(self, x, edge_index, batch, protein_seq):
108
  ligand_vec = self.ligand_gnn(x, edge_index, batch)
 
4
  import torch.nn as nn
5
 
6
 
7
+ from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
8
 
9
  class PositionalEncoding(nn.Module):
10
  def __init__(self, d_model: int, seq_len: int = 5000, dropout: float = 0.1):
 
39
 
40
 
41
 
42
+ # class LigandGNN(nn.Module): # GCN CONV
43
+ # def __init__(self, input_dim, hidden_channels):
44
+ # super().__init__()
45
+ # self.hidden_channels = hidden_channels
46
+ #
47
+ # self.conv1 = GCNConv(input_dim, hidden_channels)
48
+ # self.conv2 = GCNConv(hidden_channels, hidden_channels)
49
+ # self.conv3 = GCNConv(hidden_channels, hidden_channels)
50
+ # self.dropout = nn.Dropout(0.2)
51
+ #
52
+ # def forward(self, x, edge_index, batch):
53
+ # x = self.conv1(x, edge_index)
54
+ # x = x.relu()
55
+ # x = self.dropout(x)
56
+ #
57
+ # x = self.conv2(x, edge_index)
58
+ # x = x.relu()
59
+ # x = self.conv3(x, edge_index)
60
+ # x = self.dropout(x)
61
+ #
62
+ # # Averaging nodes and got the molecula vector
63
+ # x = global_mean_pool(x, batch) # [batch_size, hidden_channels]
64
+ # return x
65
+
66
+
67
  class LigandGNN(nn.Module):
68
+ def __init__(self, input_dim, hidden_channels, heads=4, dropout=0.2):
69
  super().__init__()
70
+ # Heads=4 means we use 4 attention heads
71
+ # Concat=False, we average the heads instead of concatenating them, to keep the output dimension same as hidden_channels
72
+ self.conv1 = GATConv(input_dim, hidden_channels, heads=heads, concat=False)
73
+ self.conv2 = GATConv(hidden_channels, hidden_channels, heads=heads, concat=False)
74
+ self.conv3 = GATConv(hidden_channels, hidden_channels, heads=heads, concat=False)
75
+ self.dropout = nn.Dropout(dropout)
76
 
77
  def forward(self, x, edge_index, batch):
78
  x = self.conv1(x, edge_index)
 
81
 
82
  x = self.conv2(x, edge_index)
83
  x = x.relu()
 
84
  x = self.dropout(x)
85
 
86
+ x = self.conv3(x, edge_index)
87
+
88
+ # Global Mean Pooling
89
+ x = global_mean_pool(x, batch)
90
  return x
91
 
92
  class ProteinTransformer(nn.Module):
93
+ def __init__(self, vocab_size, d_model=128, N=2, h=4, output_dim=128, dropout=0.2):
94
  super().__init__()
95
  self.d_model = d_model
96
  self.embedding = nn.Embedding(vocab_size, d_model)
97
+ self.pos_encoder = PositionalEncoding(d_model, dropout)
98
  encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=h, batch_first=True)
99
  self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N)
100
 
 
117
  return x
118
 
119
  class BindingAffinityModel(nn.Module):
120
+ def __init__(self, num_node_features, hidden_channels=128, gat_heads=4, dropout=0.2):
121
  super().__init__()
122
  # Tower 1 - Ligand GNN
123
+ self.ligand_gnn = LigandGNN(input_dim=num_node_features, hidden_channels=hidden_channels, heads=gat_heads, dropout=dropout)
124
  # Tower 2 - Protein Transformer
125
+ self.protein_transformer = ProteinTransformer(vocab_size=26, d_model=hidden_channels, output_dim=hidden_channels, dropout=dropout)
126
 
127
  self.head = nn.Sequential(
128
+ nn.Linear(hidden_channels*2, hidden_channels),
129
  nn.ReLU(),
130
+ nn.Dropout(dropout),
131
+ nn.Linear(hidden_channels, 1),
132
  )
133
  def forward(self, x, edge_index, batch, protein_seq):
134
  ligand_vec = self.ligand_gnn(x, edge_index, batch)
optuna_train.py ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ import pandas as pd
4
+ import optuna
5
+ from torch.nn.functional import dropout
6
+ from torch.utils.data import random_split
7
+ from torch_geometric.loader import DataLoader
8
+ from dataset import BindingDataset
9
+ from model import BindingAffinityModel
10
+ from tqdm import tqdm
11
+ import sys
12
+
13
+ DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
14
+ EPOCHS_PER_TRIAL = 10
15
+
16
# Load the dataset once at import time; every Optuna trial reuses the same
# fixed 80/20 train/test split.
dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
dataframe.dropna(inplace=True)
dataset = BindingDataset(dataframe)
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
# Seed the split generator so the hyperparameter search evaluates on the same
# split as train.py / inference.py (both seed with 42); an unseeded split made
# trial results irreproducible and incomparable to final training.
train_dataset, test_dataset = random_split(
    dataset, [train_size, test_size], generator=torch.Generator().manual_seed(42)
)
num_features = train_dataset[0].x.shape[1]
23
+
24
def train(model, loader, optimizer, criterion):
    """Run one optimization epoch over `loader` (per-batch losses are not kept)."""
    model.train()
    for batch in loader:
        batch = batch.to(DEVICE)
        optimizer.zero_grad()
        prediction = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
        loss = criterion(prediction.squeeze(), batch.y.squeeze())
        loss.backward()
        optimizer.step()
33
+
34
+
35
def test(model, loader, criterion):
    """Return the mean per-batch loss of `model` over `loader` (gradient-free)."""
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for batch in loader:
            batch = batch.to(DEVICE)
            out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
            batch_losses.append(criterion(out.squeeze(), batch.y.squeeze()).item())
    return sum(batch_losses) / len(loader)
45
+
46
+
47
def objective(trial):
    """Optuna objective: train for EPOCHS_PER_TRIAL epochs, return final test MSE.

    Searches learning rate and weight decay on a log scale; the per-epoch
    validation loss is reported to Optuna so its pruner can stop bad trials
    early.
    """
    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True)  # learning rate in [1e-5, 1e-2]
    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True)  # L2 penalty in [1e-6, 1e-3]

    # NOTE: the model's keyword is `hidden_channels`; the old
    # `hidden_channels_gnn` no longer exists in the updated model and
    # raised TypeError on every trial.
    model = BindingAffinityModel(num_node_features=num_features, hidden_channels=128).to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.MSELoss()

    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

    val_loss = float('inf')  # defined even if EPOCHS_PER_TRIAL is 0
    for epoch in range(EPOCHS_PER_TRIAL):
        train(model, train_loader, optimizer, criterion)
        val_loss = test(model, test_loader, criterion)

        trial.report(val_loss, epoch)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return val_loss
67
+
68
+
69
if __name__ == "__main__":
    # Minimize held-out MSE over the searched hyperparameters.
    study = optuna.create_study(direction="minimize")
    print("Start hyperparameter optimization...")

    study.optimize(objective, n_trials=10)
    print("\n--- Optimization Finished ---")
    print("Best parameters found: ", study.best_params)
    print("Best Test MSE: ", study.best_value)

    # Persist the full trial history for later inspection.
    study.trials_dataframe().to_csv("optuna_results.csv")
requirements.txt CHANGED
@@ -1,4 +1,6 @@
1
  torch
 
 
2
 
3
  numpy
4
  pandas
 
1
  torch
2
+ pytorch-lightning
3
+ optuna
4
 
5
  numpy
6
  pandas
train.py CHANGED
@@ -1,3 +1,5 @@
 
 
1
  import torch
2
  import torch.nn as nn
3
  import pandas as pd
@@ -6,15 +8,30 @@ from torch_geometric.loader import DataLoader
6
  from dataset import BindingDataset
7
  from model import BindingAffinityModel
8
  from tqdm import tqdm
 
 
 
9
 
10
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 
 
 
 
11
 
 
 
 
 
 
 
12
 
13
 
14
- def train_epoch(epoch, model, loader, optimizer, criterion):
15
  model.train()
16
  total_loss = 0
17
- for batch in tqdm(loader, desc=f"Training epoch: {epoch}"):
 
 
18
  batch = batch.to(DEVICE)
19
  optimizer.zero_grad()
20
 
@@ -23,21 +40,35 @@ def train_epoch(epoch, model, loader, optimizer, criterion):
23
 
24
  loss.backward()
25
  optimizer.step()
26
- total_loss += loss.item()
27
- return total_loss / len(loader)
 
 
 
 
 
 
 
 
28
 
29
- def evaluate(epoch, model, loader, criterion):
30
  model.eval()
31
  total_loss = 0
32
  with torch.no_grad():
33
- for batch in tqdm(loader, desc=f"Evaluating epoch: {epoch}"):
34
  batch = batch.to(DEVICE)
35
  out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
36
  loss = criterion(out.squeeze(), batch.y.squeeze())
37
  total_loss += loss.item()
38
- return total_loss / len(loader)
 
 
 
39
 
40
  def main():
 
 
 
41
  # Load dataset
42
  dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
43
  dataframe.dropna(inplace=True)
@@ -52,10 +83,10 @@ def main():
52
 
53
  train_size = int(0.8 * len(dataset))
54
  test_size = len(dataset) - train_size
55
- train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
56
 
57
- train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
58
- test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
59
  num_features = train_dataset[0].x.shape[1]
60
  print("Number of node features:", num_features)
61
 
@@ -63,13 +94,22 @@ def main():
63
  optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
64
  criterion = nn.MSELoss()
65
 
66
- num_epochs = 20
 
67
  print(f"Starting training on {DEVICE}")
68
- for epoch in range(num_epochs):
69
- train_loss = train_epoch(epoch, model, train_loader, optimizer, criterion)
70
- test_loss = evaluate(epoch, model, test_loader, criterion)
71
- print(f'Epoch {epoch+1}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')
72
- torch.save(model.state_dict(), './model.pth')
 
 
 
 
 
 
 
 
73
 
74
 
75
  if __name__ == "__main__":
 
1
+ import random
2
+
3
  import torch
4
  import torch.nn as nn
5
  import pandas as pd
 
8
  from dataset import BindingDataset
9
  from model import BindingAffinityModel
10
  from tqdm import tqdm
11
+ from torch.utils.tensorboard import SummaryWriter
12
+ import numpy as np
13
+ from datetime import datetime
14
 
15
  DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
16
+ BATCH_SIZE = 32
17
+ LR = 0.0005
18
+ EPOCS = 30
19
+ LOG_DIR = f"runs/experiment_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
20
 
21
+ def set_seed(seed=42):
22
+ random.seed(seed)
23
+ torch.manual_seed(seed)
24
+ torch.cuda.manual_seed(seed)
25
+ np.random.seed(seed)
26
+ return torch.Generator().manual_seed(seed)
27
 
28
 
29
+ def train_epoch(epoch, model, loader, optimizer, criterion, writer):
30
  model.train()
31
  total_loss = 0
32
+
33
+ loop = tqdm(loader, desc=f"Training epoch: {epoch}", leave=False)
34
+ for i, batch in enumerate(loop):
35
  batch = batch.to(DEVICE)
36
  optimizer.zero_grad()
37
 
 
40
 
41
  loss.backward()
42
  optimizer.step()
43
+ current_loss = loss.item()
44
+ total_loss += current_loss
45
+
46
+ global_step = (epoch - 1) * len(loader) + i
47
+ writer.add_scalar('Loss/Train_Step', current_loss, global_step)
48
+
49
+ loop.set_postfix(loss = loss.item())
50
+
51
+ avg_loss = total_loss / len(loader)
52
+ return avg_loss
53
 
54
+ def evaluate(epoch, model, loader, criterion, writer):
55
  model.eval()
56
  total_loss = 0
57
  with torch.no_grad():
58
+ for batch in tqdm(loader, desc=f"Evaluating epoch: {epoch}", leave=False):
59
  batch = batch.to(DEVICE)
60
  out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
61
  loss = criterion(out.squeeze(), batch.y.squeeze())
62
  total_loss += loss.item()
63
+
64
+ avg_loss = total_loss / len(loader)
65
+ writer.add_scalar('Loss/Test', avg_loss, epoch)
66
+ return avg_loss
67
 
68
  def main():
69
+ gen = set_seed(42)
70
+ writer = SummaryWriter(LOG_DIR)
71
+ print(f"Logging to {LOG_DIR}...")
72
  # Load dataset
73
  dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
74
  dataframe.dropna(inplace=True)
 
83
 
84
  train_size = int(0.8 * len(dataset))
85
  test_size = len(dataset) - train_size
86
+ train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=gen)
87
 
88
+ train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
89
+ test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
90
  num_features = train_dataset[0].x.shape[1]
91
  print("Number of node features:", num_features)
92
 
 
94
  optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=1e-4)
95
  criterion = nn.MSELoss()
96
 
97
+ best_test_loss = float('inf')
98
+
99
  print(f"Starting training on {DEVICE}")
100
+ for epoch in range(1, EPOCS):
101
+ train_loss = train_epoch(epoch, model, train_loader, optimizer, criterion, writer)
102
+ test_loss = evaluate(epoch, model, test_loader, criterion, writer)
103
+
104
+
105
+ print(f'Epoch {epoch:02d}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')
106
+ if test_loss < best_test_loss:
107
+ best_test_loss = test_loss
108
+ torch.save(model.state_dict(), f'best_model_gat.pth')
109
+ print(f'Best model saved with Test Loss MSE: {best_test_loss:.4f}')
110
+
111
+ writer.close()
112
+ print("Training finished.")
113
 
114
 
115
  if __name__ == "__main__":
transformer_from_scratch/attention_visual.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
transformer_from_scratch/model.py CHANGED
@@ -119,7 +119,7 @@ class MultiHeadAttention(nn.Module):
119
  1, 2
120
  )
121
 
122
- x, attention_scores = MultiHeadAttention.attention(
123
  query, key, value, mask, self.dropout
124
  )
125
 
 
119
  1, 2
120
  )
121
 
122
+ x, self.attention_scores = MultiHeadAttention.attention(
123
  query, key, value, mask, self.dropout
124
  )
125