Spaces:

AlexSychovUN
/

BindingAffinityPrediction

Sleeping

App Files Files Community

AlexSychovUN committed on Jan 24

Commit

e33b6c9

1 Parent(s): 1390640

Updated all code

Browse files

Files changed (25) hide show

.gitignore +7 -1
GNN_classification/dataset/classification/EDA.ipynb +3 -1
GNNs__practice.ipynb +0 -0
all_inferences.py +225 -0
dataset.py +53 -11
inference.py +29 -12
inference_attention.py +102 -0
main.py +79 -0
model.py +31 -15
model_attention.py +143 -0
model_pl.py +6 -6
optuna_train.py +36 -9
optuna_train_attention.py +132 -0
templates/index.html +103 -0
train.py +43 -21
train_attention.py +180 -0
train_pl.py +10 -8
transformer_from_scratch/attention_visual.ipynb +62 -14
transformer_from_scratch/config.py +4 -3
transformer_from_scratch/dataset.py +45 -19
transformer_from_scratch/inference.ipynb +23 -8
transformer_from_scratch/train.py +137 -69
transformer_from_scratch/translate.py +83 -26
utils.py +308 -0
visualization.ipynb +91 -97

.gitignore CHANGED Viewed

	@@ -1 +1,7 @@
1	- .idea

+.idea
+.venv
+.ipynb_checkpoints
+/refined-set/
+/data
+/lightning_logs

GNN_classification/dataset/classification/EDA.ipynb CHANGED Viewed

@@ -126,7 +126,9 @@
     }
    },
    "cell_type": "code",
-   "source": "train_dataset['label'].value_counts()",
    "id": "355c3ed8e5f76bbf",
    "outputs": [
     {

     }
    },
    "cell_type": "code",
+   "source": [
+    "train_dataset[\"label\"].value_counts()"
+   ],
    "id": "355c3ed8e5f76bbf",
    "outputs": [
     {

GNNs__practice.ipynb CHANGED Viewed

The diff for this file is too large to render. See raw diff

all_inferences.py ADDED Viewed

	@@ -0,0 +1,225 @@

+import datetime
+import os.path
+import torch
+import numpy as np
+from torch_geometric.data import Data, Batch
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import nglview as nv
+import py3Dmol
+from nglview import write_html
+import matplotlib
+import matplotlib.cm as cm
+import matplotlib.colors as mcolors
+from dataset import get_atom_features, get_protein_features
+from model_attention import BindingAffinityModel
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Checkpoint used by the __main__ demo at the bottom of this file.
+MODEL_PATH = "runs/experiment_attention20260124_104439_optuna/models/model_ep041_mse1.9153.pth"
+# Architecture hyperparameters passed to BindingAffinityModel in get_inference_data.
+# NOTE(review): presumably these must match the values the checkpoint was trained with - confirm.
+GAT_HEADS = 2
+HIDDEN_CHANNELS = 256
+def get_inference_data(ligand_smiles, protein_sequence, model_path):
+    """
+    Run the cross-attention model on a single ligand/protein pair.
+
+    Args:
+        ligand_smiles: SMILES string of the ligand.
+        protein_sequence: protein sequence, one character per residue.
+        model_path: path to a state_dict checkpoint for BindingAffinityModel.
+
+    Returns:
+        - mol: RDKit molecule object (explicit hydrogens) with 3D coordinates
+        - importance: numpy array with one score per atom of `mol`; the maximum
+          attention over the real (non-padded) residues, min-max normalized
+          to [0, 1], with values below 0.01 set to 0
+        - predicted_affinity: predicted binding affinity value (float)
+
+    Raises:
+        ValueError: if the SMILES cannot be parsed, no 3D conformer can be
+            generated, or the protein sequence is empty.
+    """
+    # Protein sequences are truncated / zero-padded to this fixed length.
+    max_protein_len = 1200
+    # Prepare ligand molecule with geometry RDKit
+    mol = Chem.MolFromSmiles(ligand_smiles)
+    if mol is None:
+        # MolFromSmiles signals a parse failure by returning None.
+        raise ValueError(f"Could not parse ligand SMILES: {ligand_smiles!r}")
+    mol = Chem.AddHs(mol)
+    # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
+    if AllChem.EmbedMolecule(mol, randomSeed=42) < 0:
+        # Retry from random starting coordinates before giving up.
+        if AllChem.EmbedMolecule(mol, randomSeed=42, useRandomCoords=True) < 0:
+            raise ValueError(
+                f"Could not generate 3D coordinates for ligand: {ligand_smiles!r}"
+            )
+    # Graph data PyTorch
+    atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
+    x = torch.tensor(np.array(atom_features), dtype=torch.float)
+    edge_pairs = []
+    for bond in mol.GetBonds():
+        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
+        # Store both directions so the graph is undirected.
+        edge_pairs.extend([(i, j), (j, i)])
+    if edge_pairs:
+        edge_index = torch.tensor(edge_pairs, dtype=torch.long).t().contiguous()
+    else:
+        # A ligand without bonds still needs a well-formed [2, 0] edge index.
+        edge_index = torch.empty((2, 0), dtype=torch.long)
+    tokens = [get_protein_features(c) for c in protein_sequence][:max_protein_len]
+    if not tokens:
+        # With only padding every key would be masked out of the attention.
+        raise ValueError("Protein sequence is empty")
+    tokens.extend([0] * (max_protein_len - len(tokens)))
+    protein_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(DEVICE)
+    data = Data(x=x, edge_index=edge_index)
+    batch = Batch.from_data_list([data]).to(DEVICE)
+    num_features = x.shape[1]
+    # Model loading
+    model = BindingAffinityModel(num_features, hidden_channels=HIDDEN_CHANNELS, gat_heads=GAT_HEADS).to(DEVICE)
+    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
+    model.eval()
+    # Prediction
+    with torch.no_grad():
+        pred = model(batch.x, batch.edge_index, batch.batch, protein_tensor)
+        # Attention of the only sample in the batch: [atoms, residues].
+        attention_weights = model.cross_attention.last_attention_weights[0]
+    # Attention importance, Max + Normalize
+    real_prot_len = len([t for t in tokens if t != 0])
+    importance = attention_weights[:, :real_prot_len].max(dim=1).values.cpu().numpy()
+    # Normalize to [0, 1]; skipped when all scores are equal, which would
+    # otherwise divide by zero and fill the result with NaN.
+    score_min = importance.min()
+    score_range = importance.max() - score_min
+    if score_range > 0:
+        importance = (importance - score_min) / score_range
+    # Noise reduction
+    importance[importance < 0.01] = 0
+    return mol, importance, pred.item()
+def print_atom_scores(mol, importance):
+    """Print every atom whose importance exceeds 0.1, highest score first."""
+    print("Atom importance scores:")
+    # sorted() is stable, so atoms with equal scores keep their index order.
+    ranked = sorted(
+        (
+            (atom_idx, mol.GetAtomWithIdx(atom_idx).GetSymbol(), value)
+            for atom_idx, value in enumerate(importance)
+            if value > 0.1
+        ),
+        key=lambda entry: entry[2],
+        reverse=True,
+    )
+    for atom_idx, element, value in ranked:
+        # Marker highlights the strongest atoms in the console output.
+        if value > 0.8:
+            fire = "🔥"
+        elif value > 0.5:
+            fire = "✨"
+        else:
+            fire = ""
+        print(f"Atom {atom_idx} ({element}): Importance = {value:.3f} {fire}")
+def get_py3dmol(mol, importance, score):
+    """
+    Build a py3Dmol view of the ligand colored by per-atom importance.
+
+    Args:
+        mol: RDKit molecule; its conformer supplies the label positions.
+        importance: per-atom importance scores, indexed like the atoms of `mol`.
+        score: predicted affinity, shown in a "Predicted pKd" label.
+
+    Returns:
+        The configured py3Dmol view.
+    """
+    view = py3Dmol.view(width=1000, height=800)
+    view.addModel(Chem.MolToMolBlock(mol), "sdf")
+    view.setBackgroundColor('white')
+    # 1. BASE STYLE (primer coat)
+    # Set a uniform size for the whole molecule at once
+    # scale: 0.25 is a good medium size
+    view.setStyle({}, {
+        'stick': {'color': '#cccccc', 'radius': 0.1},
+        'sphere': {'color': '#cccccc', 'scale': 0.25}
+    })
+    red_atoms = []
+    orange_atoms = []
+    blue_atoms = []
+    # Indices of the 15 highest-scoring atoms; only these can receive a text label.
+    indices_sorted = np.argsort(importance)[::-1]
+    top_indices = set(indices_sorted[:15])
+    labels_to_add = []
+    conf = mol.GetConformer()
+    # 2. BUCKETING (colors only)
+    for i, val in enumerate(importance):
+        if val >= 0.70:
+            red_atoms.append(i)
+        elif val >= 0.55:
+            orange_atoms.append(i)
+        elif val >= 0.40:
+            blue_atoms.append(i)
+        # Label a top-15 atom only when its score is above 0.1.
+        if i in top_indices and val > 0.1:
+            pos = conf.GetAtomPosition(i)
+            symbol = mol.GetAtomWithIdx(i).GetSymbol()
+            labels_to_add.append({
+                'text': f"{i}:{symbol}:{val:.2f}",
+                'pos': {'x': pos.x, 'y': pos.y, 'z': pos.z}
+            })
+    # 3. APPLY STYLES
+    # Note: scale is 0.25 everywhere (or 0.28 to make the colored atoms stand out slightly)
+    # Only the COLOR changes.
+    if red_atoms:
+        view.addStyle({'serial': red_atoms}, {
+            'sphere': {'color': '#FF0000', 'scale': 0.28},
+            'stick': {'color': '#FF0000', 'radius': 0.12}
+        })
+    if orange_atoms:
+        view.addStyle({'serial': orange_atoms}, {
+            'sphere': {'color': '#FF8C00', 'scale': 0.28},
+            'stick': {'color': '#FF8C00', 'radius': 0.12}
+        })
+    # Blue atoms get a colored sphere only; no stick style is added for them.
+    if blue_atoms:
+        view.addStyle({'serial': blue_atoms}, {
+            'sphere': {'color': '#7777FF', 'scale': 0.28}
+        })
+    # 4. LABELS
+    for label in labels_to_add:
+        view.addLabel(label['text'], {
+            'position': label['pos'],
+            'fontSize': 14,
+            'fontColor': 'white',
+            'backgroundColor': 'black',
+            'backgroundOpacity': 0.7,
+            'borderThickness': 0,
+            'inFront': True,
+            'showBackground': True
+        })
+    view.zoomTo()
+    # Prediction label placed at a fixed position in model coordinates.
+    view.addLabel(f"Predicted pKd: {float(score):.2f}",
+                  {'position': {'x': -5, 'y': 10, 'z': 0}, 'backgroundColor': 'black', 'fontColor': 'white'})
+    return view
+def get_ngl(mol, importance):
+    """
+    Build an nglview widget of the ligand colored by per-atom importance.
+
+    The importance scores are written into the PDB temperature-factor field
+    (scaled by 100) so the 'bfactor' color scheme can display them.
+
+    Args:
+        mol: RDKit molecule with 3D coordinates.
+        importance: per-atom importance scores, indexed like the atoms of `mol`.
+
+    Returns:
+        The configured nglview NGLWidget.
+    """
+    # Round-trip through a PDB block so each atom carries PDB residue info.
+    pdb_temp = Chem.MolToPDBBlock(mol)
+    mol_pdb = Chem.MolFromPDBBlock(pdb_temp, removeHs=False)
+    # NOTE(review): assumes the PDB round-trip keeps the atom order of `mol` - confirm.
+    for i, atom in enumerate(mol_pdb.GetAtoms()):
+        info = atom.GetPDBResidueInfo()
+        if info:
+            val = float(importance[i] * 100.0)
+            info.SetTempFactor(val)
+    final_pdb_block = Chem.MolToPDBBlock(mol_pdb)
+    structure = nv.TextStructure(final_pdb_block, ext="pdb")
+    view = nv.NGLWidget(structure)
+    view.clear_representations()
+    view.add_representation('ball+stick', colorScheme='bfactor', colorScale=['blue', 'white', 'red'], colorDomain=[10, 80], radiusScale=1.0)
+    # Label the 15 highest-scoring atoms by their index.
+    indices_sorted = np.argsort(importance)[::-1]
+    top_indices = indices_sorted[:15]
+    selection_str = "@" + ",".join(map(str, top_indices))
+    view.add_representation('label',
+                            selection=selection_str,  # Label only the selected atoms
+                            labelType='atomindex',  # Show the atom index (0, 1, 2...)
+                            color='black',  # Black text
+                            radius=2.0,  # Font size (try 1.5 - 3.0)
+                            zOffset=1.0)  # Shift slightly toward the camera
+    view.center()
+    return view
+if __name__ == "__main__":
+    # Demo: predict one ligand/protein pair and export both 3D visualizations.
+    smiles = "COc1ccc(S(=O)(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc2ccccc2)NC(=O)O[C@@H]2C[C@@H]3NC(=O)O[C@@H]3C2)cc1"
+    protein = "PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKMIGGIGGFIKVRQYDQIIIEIAGHKAIGTVLVGPTPVNIIGRNLLTQIGATLNF"
+    # NOTE(review): presumably the known affinity for this pair, kept for reference;
+    # it is not used by the code below.
+    affinity = 11.92
+    file_name_py3dmol = "html_results/py3dmol_result.html"
+    file_name_ngl = "html_results/ngl_result.html"
+    # Both writers below need the target directory to exist.
+    for output_path in (file_name_py3dmol, file_name_ngl):
+        os.makedirs(os.path.dirname(output_path), exist_ok=True)
+    mol, importance, score = get_inference_data(smiles, protein, MODEL_PATH)
+    print_atom_scores(mol, importance)
+    py3dmol_view = get_py3dmol(mol, importance, score)
+    py3dmol_view.write_html(file_name_py3dmol)
+    ngl_widget = get_ngl(mol, importance)
+    nv.write_html(file_name_ngl, ngl_widget)

dataset.py CHANGED Viewed

@@ -65,6 +65,14 @@ def get_atom_features(atom):
     degrees_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     numhs_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     implicit_valences_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     return np.array(
         # Type of atom (Symbol)
         one_of_k_encoding(atom.GetSymbol(), symbols_list)
@@ -93,22 +101,55 @@ def get_atom_features(atom):
         +
         # Aromaticity (Boolean)
         [atom.GetIsAromatic()]
     )
 def get_protein_features(char):
-    prot_vocab= {
-            'A': 1, 'R': 2, 'N': 3, 'D': 4, 'C': 5, 'Q': 6, 'E': 7, 'G': 8, 'H': 9,
-            'I': 10, 'L': 11, 'K': 12, 'M': 13, 'F': 14, 'P': 15, 'S': 16, 'T': 17,
-            'W': 18, 'Y': 19, 'V': 20, 'X': 21, 'Z': 21, 'B': 21,
-            'PAD': 0, 'UNK': 21
-        }
-    return prot_vocab.get(char, prot_vocab['UNK'])
 class BindingDataset(Dataset):
     def __init__(self, dataframe, max_seq_length=1000):
         self.data = dataframe
-        self.max_seq_length = max_seq_length  # Define a maximum sequence length for padding/truncation
     def __len__(self):
         return len(self.data)
@@ -144,9 +185,11 @@ class BindingDataset(Dataset):
         # Protein (Sequence, tensor of integers)
         tokens = [get_protein_features(char) for char in sequence]
         if len(tokens) > self.max_seq_length:
-            tokens = tokens[:self.max_seq_length]
         else:
-            tokens.extend([get_protein_features("PAD")] * (self.max_seq_length - len(tokens)))
         protein_tensor = torch.tensor(tokens, dtype=torch.long)
         # Affinity
@@ -164,4 +207,3 @@ if __name__ == "__main__":
     print(len(train_dataset))
     print(len(test_dataset))

     degrees_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     numhs_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
     implicit_valences_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
+    formal_charge_list = [-2, -1, 0, 1, 2]
+    chirality_list = [
+        Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
+        Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
+        Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
+        Chem.rdchem.ChiralType.CHI_OTHER,
+    ]
     return np.array(
         # Type of atom (Symbol)
         one_of_k_encoding(atom.GetSymbol(), symbols_list)
         +
         # Aromaticity (Boolean)
         [atom.GetIsAromatic()]
+        +
+        # Formal Charge
+        one_of_k_encoding(atom.GetFormalCharge(), formal_charge_list)
+        +
+        # Chirality (Geometry)
+        one_of_k_encoding(atom.GetChiralTag(), chirality_list)
+        +
+        # Is in ring (Boolean)
+        [atom.IsInRing()]
     )
 def get_protein_features(char):
+    prot_vocab = {
+        "A": 1,
+        "R": 2,
+        "N": 3,
+        "D": 4,
+        "C": 5,
+        "Q": 6,
+        "E": 7,
+        "G": 8,
+        "H": 9,
+        "I": 10,
+        "L": 11,
+        "K": 12,
+        "M": 13,
+        "F": 14,
+        "P": 15,
+        "S": 16,
+        "T": 17,
+        "W": 18,
+        "Y": 19,
+        "V": 20,
+        "X": 21,
+        "Z": 21,
+        "B": 21,
+        "PAD": 0,
+        "UNK": 21,
+    }
+    return prot_vocab.get(char, prot_vocab["UNK"])
 class BindingDataset(Dataset):
     def __init__(self, dataframe, max_seq_length=1000):
         self.data = dataframe
+        self.max_seq_length = (
+            max_seq_length  # Define a maximum sequence length for padding/truncation
+        )
     def __len__(self):
         return len(self.data)
         # Protein (Sequence, tensor of integers)
         tokens = [get_protein_features(char) for char in sequence]
         if len(tokens) > self.max_seq_length:
+            tokens = tokens[: self.max_seq_length]
         else:
+            tokens.extend(
+                [get_protein_features("PAD")] * (self.max_seq_length - len(tokens))
+            )
         protein_tensor = torch.tensor(tokens, dtype=torch.long)
         # Affinity
     print(len(train_dataset))
     print(len(test_dataset))

inference.py CHANGED Viewed

@@ -10,9 +10,11 @@ from model import BindingAffinityModel
 from tqdm import tqdm
 from scipy.stats import pearsonr
 from torch.utils.data import random_split
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-MODEL_PATH = "best_model_gat.pth"
 def set_seed(seed=42):
     random.seed(seed)
@@ -21,11 +23,12 @@ def set_seed(seed=42):
     np.random.seed(seed)
     return torch.Generator().manual_seed(seed)
 def predict_and_plot():
     gen = set_seed(42)
     print("Loading data...")
-    dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
     dataframe.dropna(inplace=True)
     dataset = BindingDataset(dataframe)
     if len(dataset) == 0:
@@ -40,7 +43,12 @@ def predict_and_plot():
     num_features = test_dataset[0].x.shape[1]
     print("Loading model...")
-    model = BindingAffinityModel(num_node_features=num_features, hidden_channels_gnn=128).to(DEVICE)
     model.load_state_dict(torch.load(MODEL_PATH))
     model.eval()
@@ -67,19 +75,28 @@ def predict_and_plot():
     print(f"Pearson Correlation: {pearson_corr:.4f}")
     plt.figure(figsize=(9, 9))
-    plt.scatter(y_true, y_pred, alpha=0.4, s=15, c='blue', label='Predictions')
-    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='red', linestyle='--', linewidth=2,
-             label='Ideal')
-    plt.xlabel('Experimental Affinity (pK)')
-    plt.ylabel('Predicted Affinity (pK)')
-    plt.title(f'Binding affinity Results\nRMSE={rmse:.3f}, Pearson R={pearson_corr:.3f}')
     plt.legend()
     plt.grid(True, alpha=0.3)
-    plot_file = 'final_results_gat.png'
     plt.savefig(plot_file)
     print(f"График сохранен в {plot_file}")
     plt.show()
 if __name__ == "__main__":
-    predict_and_plot()

 from tqdm import tqdm
 from scipy.stats import pearsonr
 from torch.utils.data import random_split
+from train import GAT_HEADS, DROPOUT, HIDDEN_CHANNELS
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+MODEL_PATH = "runs/experiment_20260122_230138_GAT_without_formal_charge_chirality_ring_scheduler/models/model_ep092_mse2.3805.pth"
 def set_seed(seed=42):
     random.seed(seed)
     np.random.seed(seed)
     return torch.Generator().manual_seed(seed)
 def predict_and_plot():
     gen = set_seed(42)
     print("Loading data...")
+    dataframe = pd.read_csv("pdbbind_refined_dataset.csv")
     dataframe.dropna(inplace=True)
     dataset = BindingDataset(dataframe)
     if len(dataset) == 0:
     num_features = test_dataset[0].x.shape[1]
     print("Loading model...")
+    model = BindingAffinityModel(
+        num_node_features=num_features,
+        hidden_channels=HIDDEN_CHANNELS,
+        gat_heads=GAT_HEADS,
+        dropout=DROPOUT,
+    ).to(DEVICE)
     model.load_state_dict(torch.load(MODEL_PATH))
     model.eval()
     print(f"Pearson Correlation: {pearson_corr:.4f}")
     plt.figure(figsize=(9, 9))
+    plt.scatter(y_true, y_pred, alpha=0.4, s=15, c="blue", label="Predictions")
+    plt.plot(
+        [min(y_true), max(y_true)],
+        [min(y_true), max(y_true)],
+        color="red",
+        linestyle="--",
+        linewidth=2,
+        label="Ideal",
+    )
+    plt.xlabel("Experimental Affinity (pK)")
+    plt.ylabel("Predicted Affinity (pK)")
+    plt.title(
+        f"Binding affinity Results\nRMSE={rmse:.3f}, Pearson R={pearson_corr:.3f}"
+    )
     plt.legend()
     plt.grid(True, alpha=0.3)
+    plot_file = "final_results_gat.png"
     plt.savefig(plot_file)
     print(f"График сохранен в {plot_file}")
     plt.show()
 if __name__ == "__main__":
+    predict_and_plot()

inference_attention.py ADDED Viewed

	@@ -0,0 +1,102 @@

+import random
+import torch
+import pandas as pd
+import matplotlib.pyplot as plt
+import numpy as np
+from torch_geometric.loader import DataLoader
+from dataset import BindingDataset
+from model_attention import BindingAffinityModel
+from tqdm import tqdm
+from scipy.stats import pearsonr
+from torch.utils.data import random_split
+from train_attention import GAT_HEADS, DROPOUT, HIDDEN_CHANNELS
+# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+# Checkpoint evaluated by predict_and_plot().
+MODEL_PATH = "runs/experiment_attention20260123_103840_with_additional_data_scheduler/models/model_ep032_mse2.0264.pth"
+def set_seed(seed=42):
+    """Seed the Python, PyTorch (CPU and CUDA) and NumPy RNGs; return a seeded torch.Generator."""
+    # Same seeding order as before: random, torch, torch.cuda, numpy.
+    for seed_fn in (random.seed, torch.manual_seed, torch.cuda.manual_seed, np.random.seed):
+        seed_fn(seed)
+    generator = torch.Generator()
+    generator.manual_seed(seed)
+    return generator
+def predict_and_plot():
+    """
+    Evaluate the checkpoint at MODEL_PATH on a held-out 20% split and plot the results.
+
+    Loads pdbbind_refined_dataset.csv, takes the second part of a seeded 80/20
+    random split as the test set, prints RMSE, MAE and Pearson correlation,
+    and saves a predicted-vs-experimental scatter plot to final_results_gat.png.
+    NOTE(review): presumably the seeded split mirrors the one used in training - confirm.
+    """
+    gen = set_seed(42)
+    print("Loading data...")
+    dataframe = pd.read_csv("pdbbind_refined_dataset.csv")
+    dataframe.dropna(inplace=True)
+    dataset = BindingDataset(dataframe)
+    if len(dataset) == 0:
+        print("Dataset is empty")
+        return
+    train_size = int(0.8 * len(dataset))
+    test_size = len(dataset) - train_size
+    _, test_dataset = random_split(dataset, [train_size, test_size], generator=gen)
+    loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
+    num_features = test_dataset[0].x.shape[1]
+    print("Loading model...")
+    model = BindingAffinityModel(
+        num_node_features=num_features,
+        hidden_channels=HIDDEN_CHANNELS,
+        gat_heads=GAT_HEADS,
+        dropout=DROPOUT,
+    ).to(DEVICE)
+    # map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
+    model.load_state_dict(torch.load(MODEL_PATH, map_location=DEVICE))
+    model.eval()
+    y_true = []
+    y_pred = []
+    print("Predicting...")
+    with torch.no_grad():
+        for batch in tqdm(loader):
+            batch = batch.to(DEVICE)
+            out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
+            # reshape(-1) keeps a 1-D result even when the last batch holds a
+            # single sample; squeeze() would yield a 0-d array that cannot be
+            # passed to list.extend.
+            y_true.extend(batch.y.reshape(-1).cpu().numpy())
+            y_pred.extend(out.reshape(-1).cpu().numpy())
+    y_true = np.array(y_true)
+    y_pred = np.array(y_pred)
+    rmse = np.sqrt(np.mean((y_true - y_pred) ** 2))
+    mae = np.mean(np.abs(y_true - y_pred))
+    pearson_corr, _ = pearsonr(y_true, y_pred)  # Pearson correlation
+    print("Results:")
+    print(f"RMSE: {rmse:.4f}")
+    print(f"MAE: {mae:.4f}")
+    print(f"Pearson Correlation: {pearson_corr:.4f}")
+    plt.figure(figsize=(9, 9))
+    plt.scatter(y_true, y_pred, alpha=0.4, s=15, c="blue", label="Predictions")
+    # Diagonal y = x over the range of the true values: the ideal prediction line.
+    plt.plot(
+        [min(y_true), max(y_true)],
+        [min(y_true), max(y_true)],
+        color="red",
+        linestyle="--",
+        linewidth=2,
+        label="Ideal",
+    )
+    plt.xlabel("Experimental Affinity (pK)")
+    plt.ylabel("Predicted Affinity (pK)")
+    plt.title(
+        f"Binding affinity Results\nRMSE={rmse:.3f}, Pearson R={pearson_corr:.3f}"
+    )
+    plt.legend()
+    plt.grid(True, alpha=0.3)
+    plot_file = "final_results_gat.png"
+    plt.savefig(plot_file)
+    print(f"График сохранен в {plot_file}")
+    plt.show()
+# Run the evaluation only when executed as a script, not when imported.
+if __name__ == "__main__":
+    predict_and_plot()

main.py ADDED Viewed

	@@ -0,0 +1,79 @@

+import os
+import uuid
+from fastapi import FastAPI, Request, Form
+from fastapi.templating import Jinja2Templates
+from fastapi.staticfiles import StaticFiles
+from fastapi.responses import HTMLResponse
+from utils import get_inference_data, get_py3dmol_view,save_standalone_ngl_html
+import nglview as nv
+# FastAPI application object; the routes below are registered on it.
+app = FastAPI()
+# Create the output directory before it is mounted as static files.
+os.makedirs("html_results", exist_ok=True)
+# Files written to html_results/ are served under the /results URL prefix.
+app.mount("/results", StaticFiles(directory="html_results"), name="results")
+# Jinja2 templates are loaded from the templates/ directory.
+templates = Jinja2Templates(directory="templates")
+@app.get("/", response_class=HTMLResponse)
+async def read_root(request: Request):
+    """Render index.html with no prediction result in the context."""
+    context = {"request": request}
+    return templates.TemplateResponse("index.html", context)
+@app.post("/predict", response_class=HTMLResponse)
+async def predict(
+    request: Request,
+    smiles_ligand: str = Form(...),
+    sequence_protein: str = Form(...)
+):
+    """
+    Run inference for the submitted ligand/protein pair and render the result page.
+
+    The page receives the predicted affinity, the 15 highest-scoring atoms,
+    the inline py3Dmol HTML and a link to a standalone NGL HTML file saved
+    under html_results/.
+    """
+    mol, importance, affinity = get_inference_data(smiles_ligand, sequence_protein)
+    # Icon thresholds, checked from highest to lowest; the first match wins.
+    icon_levels = ((0.9, "🔥"), (0.7, "✨"), (0.5, "⭐"))
+    ranked = sorted(range(len(importance)), key=lambda k: importance[k], reverse=True)
+    atom_list = []
+    for atom_idx in ranked[:15]:
+        value = importance[atom_idx]
+        icon = next((mark for cutoff, mark in icon_levels if value >= cutoff), "")
+        atom_list.append({
+            "id": atom_idx,
+            "symbol": mol.GetAtomWithIdx(atom_idx).GetSymbol(),
+            "score": f"{value:.3f}",
+            "icon": icon
+        })
+    # A random UUID keeps result files of different requests from colliding.
+    filename_ngl = f"ngl_{str(uuid.uuid4())}.html"
+    filepath_ngl = os.path.join("html_results", filename_ngl)
+    py3dmol_content = get_py3dmol_view(mol, importance)._make_html()
+    save_standalone_ngl_html(mol, importance, filepath_ngl)
+    context = {
+        "request": request,
+        "result_ready": True,
+        "smiles": smiles_ligand,
+        "protein": sequence_protein,
+        "affinity": f"{affinity:.2f}",
+        "atom_list": atom_list,
+        "html_py3dmol": py3dmol_content,
+        "url_ngl": f"/results/{filename_ngl}"
+    }
+    return templates.TemplateResponse("index.html", context)

model.py CHANGED Viewed

@@ -1,11 +1,9 @@
 import math
 import torch
 import torch.nn as nn
 from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
 class PositionalEncoding(nn.Module):
     def __init__(self, d_model: int, seq_len: int = 5000, dropout: float = 0.1):
         super().__init__()
@@ -34,11 +32,10 @@ class PositionalEncoding(nn.Module):
     def forward(self, x):
         # x: [batch_size, seq_len, d_model]
-        x = x + (self.pe[:, :x.shape[1], :]).requires_grad_(False)
         return self.dropout(x)
 # class LigandGNN(nn.Module): # GCN CONV
 #     def __init__(self, input_dim, hidden_channels):
 #         super().__init__()
@@ -70,8 +67,12 @@ class LigandGNN(nn.Module):
         # Heads=4 means we use 4 attention heads
         # Concat=False, we average the heads instead of concatenating them, to keep the output dimension same as hidden_channels
         self.conv1 = GATConv(input_dim, hidden_channels, heads=heads, concat=False)
-        self.conv2 = GATConv(hidden_channels, hidden_channels, heads=heads, concat=False)
-        self.conv3 = GATConv(hidden_channels, hidden_channels, heads=heads, concat=False)
         self.dropout = nn.Dropout(dropout)
     def forward(self, x, edge_index, batch):
@@ -89,20 +90,23 @@ class LigandGNN(nn.Module):
         x = global_mean_pool(x, batch)
         return x
 class ProteinTransformer(nn.Module):
     def __init__(self, vocab_size, d_model=128, N=2, h=4, output_dim=128, dropout=0.2):
         super().__init__()
         self.d_model = d_model
         self.embedding = nn.Embedding(vocab_size, d_model)
         self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)
-        encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=h, batch_first=True)
         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N)
         self.fc = nn.Linear(d_model, output_dim)
     def forward(self, x):
         # x: [batch_size, seq_len]
-        padding_mask = (x == 0) # mask for PAD tokens
         x = self.embedding(x) * math.sqrt(self.d_model)
         x = self.pos_encoder(x)
         x = self.transformer(x, src_key_padding_mask=padding_mask)
@@ -116,20 +120,34 @@ class ProteinTransformer(nn.Module):
         x = self.fc(x)
         return x
 class BindingAffinityModel(nn.Module):
-    def __init__(self, num_node_features, hidden_channels=128, gat_heads=4, dropout=0.2):
         super().__init__()
         # Tower 1 - Ligand GNN
-        self.ligand_gnn = LigandGNN(input_dim=num_node_features, hidden_channels=hidden_channels, heads=gat_heads, dropout=dropout)
         # Tower 2 - Protein Transformer
-        self.protein_transformer = ProteinTransformer(vocab_size=26, d_model=hidden_channels, output_dim=hidden_channels, dropout=dropout)
         self.head = nn.Sequential(
-            nn.Linear(hidden_channels*2, hidden_channels),
             nn.ReLU(),
             nn.Dropout(dropout),
             nn.Linear(hidden_channels, 1),
         )
     def forward(self, x, edge_index, batch, protein_seq):
         ligand_vec = self.ligand_gnn(x, edge_index, batch)
         batch_size = batch.max().item() + 1
@@ -138,5 +156,3 @@ class BindingAffinityModel(nn.Module):
         protein_vec = self.protein_transformer(protein_seq)
         combined = torch.cat([ligand_vec, protein_vec], dim=1)
         return self.head(combined)

 import math
 import torch
 import torch.nn as nn
 from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
 class PositionalEncoding(nn.Module):
     def __init__(self, d_model: int, seq_len: int = 5000, dropout: float = 0.1):
         super().__init__()
     def forward(self, x):
         # x: [batch_size, seq_len, d_model]
+        x = x + (self.pe[:, : x.shape[1], :]).requires_grad_(False)
         return self.dropout(x)
 # class LigandGNN(nn.Module): # GCN CONV
 #     def __init__(self, input_dim, hidden_channels):
 #         super().__init__()
         # Heads=4 means we use 4 attention heads
         # Concat=False, we average the heads instead of concatenating them, to keep the output dimension same as hidden_channels
         self.conv1 = GATConv(input_dim, hidden_channels, heads=heads, concat=False)
+        self.conv2 = GATConv(
+            hidden_channels, hidden_channels, heads=heads, concat=False
+        )
+        self.conv3 = GATConv(
+            hidden_channels, hidden_channels, heads=heads, concat=False
+        )
         self.dropout = nn.Dropout(dropout)
     def forward(self, x, edge_index, batch):
         x = global_mean_pool(x, batch)
         return x
 class ProteinTransformer(nn.Module):
     def __init__(self, vocab_size, d_model=128, N=2, h=4, output_dim=128, dropout=0.2):
         super().__init__()
         self.d_model = d_model
         self.embedding = nn.Embedding(vocab_size, d_model)
         self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)
+        encoder_layer = nn.TransformerEncoderLayer(
+            d_model=d_model, nhead=h, batch_first=True
+        )
         self.transformer = nn.TransformerEncoder(encoder_layer, num_layers=N)
         self.fc = nn.Linear(d_model, output_dim)
     def forward(self, x):
         # x: [batch_size, seq_len]
+        padding_mask = x == 0  # mask for PAD tokens
         x = self.embedding(x) * math.sqrt(self.d_model)
         x = self.pos_encoder(x)
         x = self.transformer(x, src_key_padding_mask=padding_mask)
         x = self.fc(x)
         return x
 class BindingAffinityModel(nn.Module):
+    def __init__(
+        self, num_node_features, hidden_channels=128, gat_heads=4, dropout=0.2
+    ):
         super().__init__()
         # Tower 1 - Ligand GNN
+        self.ligand_gnn = LigandGNN(
+            input_dim=num_node_features,
+            hidden_channels=hidden_channels,
+            heads=gat_heads,
+            dropout=dropout,
+        )
         # Tower 2 - Protein Transformer
+        self.protein_transformer = ProteinTransformer(
+            vocab_size=26,
+            d_model=hidden_channels,
+            output_dim=hidden_channels,
+            dropout=dropout,
+        )
         self.head = nn.Sequential(
+            nn.Linear(hidden_channels * 2, hidden_channels),
             nn.ReLU(),
             nn.Dropout(dropout),
             nn.Linear(hidden_channels, 1),
         )
     def forward(self, x, edge_index, batch, protein_seq):
         ligand_vec = self.ligand_gnn(x, edge_index, batch)
         batch_size = batch.max().item() + 1
         protein_vec = self.protein_transformer(protein_seq)
         combined = torch.cat([ligand_vec, protein_vec], dim=1)
         return self.head(combined)

model_attention.py ADDED Viewed

	@@ -0,0 +1,143 @@

+import torch
+import torch.nn as nn
+from torch_geometric.nn import GATConv
+from torch_geometric.utils import to_dense_batch
+import torch.nn.functional as F
+class CrossAttentionLayer(nn.Module):
+    """
+    Transformer-style cross-attention block: ligand atoms attend to protein residues.
+
+    The attention weights of the most recent forward pass are kept (detached,
+    on the CPU) in `last_attention_weights`.
+    """
+    def __init__(self, feature_dim, num_heads=4, dropout=0.1):
+        super().__init__()
+        # Multi-head attention over hidden features of size feature_dim
+        self.attention = nn.MultiheadAttention(
+            embed_dim=feature_dim,
+            num_heads=num_heads,
+            dropout=dropout,
+            batch_first=True,
+        )
+        # LayerNorm applied after the attention residual
+        self.norm = nn.LayerNorm(feature_dim)
+        # Position-wise feedforward network with a 4x wider hidden layer
+        expanded_dim = feature_dim * 4
+        self.ff = nn.Sequential(
+            nn.Linear(feature_dim, expanded_dim),
+            nn.GELU(),
+            nn.Dropout(dropout),
+            nn.Linear(expanded_dim, feature_dim),
+        )
+        # LayerNorm applied after the feedforward residual
+        self.norm_ff = nn.LayerNorm(feature_dim)
+        self.last_attention_weights = None
+    def forward(self, ligand_features, protein_features, key_padding_mask=None):
+        # ligand_features: [Batch, Atoms, Dim], used as the queries
+        # protein_features: [Batch, Residues, Dim], used as keys and values
+        # key_padding_mask: True marks residues the attention must ignore
+        attended, head_avg_weights = self.attention(
+            query=ligand_features,
+            key=protein_features,
+            value=protein_features,
+            key_padding_mask=key_padding_mask,
+            need_weights=True,
+            average_attn_weights=True,
+        )
+        # Keep a detached CPU copy of the head-averaged weights for inspection
+        self.last_attention_weights = head_avg_weights.detach().cpu()
+        # Residual + norm around the attention
+        hidden = self.norm(ligand_features + attended)
+        # Residual + norm around the feedforward network
+        return self.norm_ff(hidden + self.ff(hidden))
+class BindingAffinityModel(nn.Module):
+    """
+    Binding-affinity regressor: GAT ligand encoder, embedded protein sequence,
+    and a cross-attention layer in which ligand atoms attend to protein residues.
+
+    forward() returns one value per graph in the batch, shape [Batch, 1].
+    """
+    def __init__(
+        self, num_node_features, hidden_channels=256, gat_heads=2, dropout=0.3
+    ):
+        super().__init__()
+        self.dropout = dropout
+        self.hidden_channels = hidden_channels
+        # Tower 1 - Ligand GNN with GAT layers, using 3 GAT layers, so that every atom can "see" up to 3 bonds away
+        # concat=False averages the heads, so each layer outputs hidden_channels features
+        self.gat1 = GATConv(
+            num_node_features, hidden_channels, heads=gat_heads, concat=False
+        )
+        self.gat2 = GATConv(
+            hidden_channels, hidden_channels, heads=gat_heads, concat=False
+        )
+        self.gat3 = GATConv(
+            hidden_channels, hidden_channels, heads=gat_heads, concat=False
+        )
+        # Tower 2 - Protein encoder, 22 = 21 amino acid tokens + 1 padding token PAD
+        self.protein_embedding = nn.Embedding(22, hidden_channels)
+        # 1D convolution (kernel size 3) along the sequence, mixing each residue with
+        # its direct neighbours; this class adds no explicit positional encoding
+        self.prot_conv = nn.Conv1d(
+            hidden_channels, hidden_channels, kernel_size=3, padding=1
+        )
+        # Cross-Attention Layer, atoms attending to amino acids
+        # (its number of heads is fixed at 4, independent of gat_heads)
+        self.cross_attention = CrossAttentionLayer(
+            feature_dim=hidden_channels, num_heads=4, dropout=dropout
+        )
+        self.fc1 = nn.Linear(hidden_channels, hidden_channels)
+        self.fc2 = nn.Linear(hidden_channels, 1)  # Final output for regression, pKd
+    def forward(self, x, edge_index, batch, protein_seq):
+        # Ligand GNN forward pass (Graph -> Node Embeddings)
+        x = F.elu(self.gat1(x, edge_index))
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = F.elu(self.gat2(x, edge_index))
+        x = F.dropout(x, p=self.dropout, training=self.training)
+        x = F.elu(self.gat3(x, edge_index))  # [Total_Atoms, Hidden_Channels]
+        # Convert graph into tensor [Batch, Max_Atoms, Hidden_Channels]
+        # to_dense_batch adds zero padding where necessary
+        ligand_dense, ligand_mask = to_dense_batch(x, batch)
+        # ligand_dense: [Batch, Max_Atoms, Hidden_Channels]
+        # ligand_mask: [Batch, Max_Atoms] True where there is real atom, False where there is padding
+        batch_size = ligand_dense.size(0)
+        # Reshape to one row per graph, whatever layout the caller passed in
+        protein_seq = protein_seq.view(batch_size, -1)  # [Batch, Seq_Len]
+        # Protein forward pass protein_seq: [Batch, Seq_Len]
+        p = self.protein_embedding(protein_seq)  # [Batch, Seq_Len, Hidden_Channels]
+        # A simple convolution to understand local context in amino acids
+        p = p.permute(0, 2, 1)  # Change to [Batch, Hidden_Channels, Seq_Len] for Conv1d
+        p = F.relu(self.prot_conv(p))
+        p = p.permute(0, 2, 1)  # [Batch, Seq, Hidden_Channels]
+        # Padding mask for the protein: True where the token is PAD (0).
+        # PyTorch MHA expects key_padding_mask to be True at positions to ignore
+        protein_pad_mask = protein_seq == 0
+        # Cross-Attention
+        x_cross = self.cross_attention(
+            ligand_dense, p, key_padding_mask=protein_pad_mask
+        )
+        # Pooling over atoms to get a single vector per molecule, considering only real atoms, ignoring paddings
+        # ligand mask True where real atom, False where padding
+        mask_expanded = ligand_mask.unsqueeze(-1)  # [Batch, Max_Atoms, 1]
+        # Zero out the padded atom features
+        x_cross = x_cross * mask_expanded
+        # Sum the features of real atoms / number of real atoms to get the mean
+        sum_features = torch.sum(x_cross, dim=1)  # [Batch, Hidden_Channels]
+        num_atoms = torch.sum(mask_expanded, dim=1)  # [Batch, 1]
+        pooled_x = sum_features / (num_atoms + 1e-6)  # Avoid division by zero
+        # MLP Head
+        out = F.relu(self.fc1(pooled_x))
+        out = F.dropout(out, p=self.dropout, training=self.training)
+        out = self.fc2(out)
+        return out

model_pl.py CHANGED Viewed

@@ -8,19 +8,19 @@ from torch.optim import Adam
 from model import LigandGNN, ProteinTransformer
 class BindingAffinityModelPL(pl.LightningModule):
     def __init__(self, num_node_features, hidden_channels_gnn, lr):
         super().__init__()
-        self.save_hyperparameters() # Save hyperparameters for easy access
         self.lr = lr
-        self.ligand_gnn = LigandGNN(input_dim=num_node_features, hidden_channels=hidden_channels_gnn)
         self.protein_transformer = ProteinTransformer(vocab_size=26)
         self.head = nn.Sequential(
-            nn.Linear(128 + 128, 256),
-            nn.ReLU(),
-            nn.Dropout(0.2),
-            nn.Linear(256, 1)
         )
         self.criterion = nn.MSELoss()

 from model import LigandGNN, ProteinTransformer
 class BindingAffinityModelPL(pl.LightningModule):
     def __init__(self, num_node_features, hidden_channels_gnn, lr):
         super().__init__()
+        self.save_hyperparameters()  # Save hyperparameters for easy access
         self.lr = lr
+        self.ligand_gnn = LigandGNN(
+            input_dim=num_node_features, hidden_channels=hidden_channels_gnn
+        )
         self.protein_transformer = ProteinTransformer(vocab_size=26)
         self.head = nn.Sequential(
+            nn.Linear(128 + 128, 256), nn.ReLU(), nn.Dropout(0.2), nn.Linear(256, 1)
         )
         self.criterion = nn.MSELoss()

optuna_train.py CHANGED Viewed

@@ -9,10 +9,11 @@ from torch.utils.data import random_split
 from dataset import BindingDataset
 from model import BindingAffinityModel
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 N_TRIALS = 20
 EPOCHS_PER_TRIAL = 15
 def set_seed(seed=42):
     random.seed(seed)
     np.random.seed(seed)
@@ -20,7 +21,8 @@ def set_seed(seed=42):
     torch.cuda.manual_seed(seed)
     return torch.Generator().manual_seed(seed)
-dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
 dataframe.dropna(inplace=True)
 dataset = BindingDataset(dataframe)
@@ -28,9 +30,12 @@ gen = set_seed(42)
 train_size = int(0.8 * len(dataset))
 test_size = len(dataset) - train_size
-train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=gen)
 num_features = train_dataset[0].x.shape[1]
 def train(model, loader, optimizer, criterion):
     model.train()
     for batch in loader:
@@ -62,28 +67,50 @@ def objective(trial):
     # Learning
-    lr = trial.suggest_float("lr", 1e-5, 1e-2, log=True) # Learning rate from 0.00001 to 0.01
-    weight_decay = trial.suggest_float("weight_decay", 1e-6, 1e-3, log=True) # Weight decay from 0.000001 to 0.001
     batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
-    model = BindingAffinityModel(num_node_features=num_features, hidden_channels=hidden_dim, gat_heads=gat_heads, dropout=dropout).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
     criterion = nn.MSELoss()
     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
     for epoch in range(EPOCHS_PER_TRIAL):
         train(model, train_loader, optimizer, criterion)
         val_loss = test(model, test_loader, criterion)
-        print(f"Trial {trial.number} | Epoch {epoch + 1}/{EPOCHS_PER_TRIAL} | Val Loss: {val_loss:.4f}")
         trial.report(val_loss, epoch)
         if trial.should_prune():
             raise optuna.exceptions.TrialPruned()
-    return val_loss
 if __name__ == "__main__":
@@ -93,7 +120,7 @@ if __name__ == "__main__":
         pruner=optuna.pruners.MedianPruner(),
         storage=storage_name,
         study_name="binding_prediction_optimization",
-        load_if_exists=True
     )
     print("Start hyperparameter optimization...")

 from dataset import BindingDataset
 from model import BindingAffinityModel
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 N_TRIALS = 20
 EPOCHS_PER_TRIAL = 15
 def set_seed(seed=42):
     random.seed(seed)
     np.random.seed(seed)
     torch.cuda.manual_seed(seed)
     return torch.Generator().manual_seed(seed)
+dataframe = pd.read_csv("pdbbind_refined_dataset.csv")
 dataframe.dropna(inplace=True)
 dataset = BindingDataset(dataframe)
 train_size = int(0.8 * len(dataset))
 test_size = len(dataset) - train_size
+train_dataset, test_dataset = random_split(
+    dataset, [train_size, test_size], generator=gen
+)
 num_features = train_dataset[0].x.shape[1]
 def train(model, loader, optimizer, criterion):
     model.train()
     for batch in loader:
     # Learning
+    lr = trial.suggest_float(
+        "lr", 1e-5, 1e-2, log=True
+    )  # Learning rate from 0.00001 to 0.01
+    weight_decay = trial.suggest_float(
+        "weight_decay", 1e-6, 1e-3, log=True
+    )  # Weight decay from 0.000001 to 0.001
     batch_size = trial.suggest_categorical("batch_size", [16, 32, 64])
+    model = BindingAffinityModel(
+        num_node_features=num_features,
+        hidden_channels=hidden_dim,
+        gat_heads=gat_heads,
+        dropout=dropout,
+    ).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer, mode="min", factor=0.5, patience=5
+    )
     criterion = nn.MSELoss()
     train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+    best_val_loss = float("inf")
     for epoch in range(EPOCHS_PER_TRIAL):
         train(model, train_loader, optimizer, criterion)
         val_loss = test(model, test_loader, criterion)
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+        scheduler.step(val_loss)
+        print(
+            f"Trial {trial.number} | Epoch {epoch + 1}/{EPOCHS_PER_TRIAL} | Val Loss: {val_loss:.4f}"
+        )
         trial.report(val_loss, epoch)
         if trial.should_prune():
             raise optuna.exceptions.TrialPruned()
+    return best_val_loss
 if __name__ == "__main__":
         pruner=optuna.pruners.MedianPruner(),
         storage=storage_name,
         study_name="binding_prediction_optimization",
+        load_if_exists=True,
     )
     print("Start hyperparameter optimization...")

optuna_train_attention.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import optuna
+import torch
+import torch.nn as nn
+import pandas as pd
+import random
+import numpy as np
+from torch_geometric.loader import DataLoader
+from torch.utils.data import random_split
+from dataset import BindingDataset
+from model_attention import BindingAffinityModel
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+N_TRIALS = 50
+MAX_EPOCHS_PER_TRIAL = 60
+LOG_DIR = "runs"
+DATA_CSV = "pdbbind_refined_dataset.csv"
+def set_seed(seed=42):
+    random.seed(seed)
+    np.random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    return torch.Generator().manual_seed(seed)
+dataframe = pd.read_csv(DATA_CSV)
+dataframe.dropna(inplace=True)
+dataset = BindingDataset(dataframe, max_seq_length=1200)
+gen = set_seed(42)
+train_size = int(0.8 * len(dataset))
+test_size = len(dataset) - train_size
+train_dataset, test_dataset = random_split(
+    dataset, [train_size, test_size], generator=gen
+)
+num_features = train_dataset[0].x.shape[1]
+def train(model, loader, optimizer, criterion):
+    model.train()
+    for batch in loader:
+        batch = batch.to(DEVICE)
+        optimizer.zero_grad()
+        out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
+        loss = criterion(out.squeeze(), batch.y.squeeze())
+        loss.backward()
+        optimizer.step()
+def test(model, loader, criterion):
+    model.eval()
+    total_loss = 0
+    with torch.no_grad():
+        for batch in loader:
+            batch = batch.to(DEVICE)
+            out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
+            loss = criterion(out.squeeze(), batch.y.squeeze())
+            total_loss += loss.item()
+    return total_loss / len(loader)
+def objective(trial):
+    # Architecture
+    hidden_dim = trial.suggest_categorical("hidden_dim", [128, 256])
+    gat_heads = trial.suggest_categorical("gat_heads", [2, 4])
+    dropout = trial.suggest_float("dropout", 0.2, 0.5)
+    # Learning
+    lr = trial.suggest_float(
+        "lr", 1e-5, 1e-3, log=True
+    )  # Learning rate from 0.00001 to 0.001
+    weight_decay = trial.suggest_float(
+        "weight_decay", 1e-6, 1e-3, log=True
+    )  # Weight decay from 0.000001 to 0.001
+    batch_size = 16
+    model = BindingAffinityModel(
+        num_node_features=num_features,
+        hidden_channels=hidden_dim,
+        gat_heads=gat_heads,
+        dropout=dropout,
+    ).to(DEVICE)
+    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
+    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer, mode="min", factor=0.5, patience=5
+    )
+    criterion = nn.MSELoss()
+    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
+    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
+    best_val_loss = float("inf")
+    for epoch in range(MAX_EPOCHS_PER_TRIAL):
+        train(model, train_loader, optimizer, criterion)
+        val_loss = test(model, test_loader, criterion)
+        scheduler.step(val_loss)
+        if val_loss < best_val_loss:
+            best_val_loss = val_loss
+        print(
+            f"Trial {trial.number} | Epoch {epoch + 1}/{MAX_EPOCHS_PER_TRIAL} | Val Loss: {val_loss:.4f}"
+        )
+        trial.report(val_loss, epoch)
+        if trial.should_prune():
+            raise optuna.exceptions.TrialPruned()
+    return best_val_loss
+if __name__ == "__main__":
+    storage_name = "sqlite:///db.sqlite3"
+    study = optuna.create_study(
+        direction="minimize",
+        pruner=optuna.pruners.MedianPruner(n_min_trials=5, n_warmup_steps=10),
+        storage=storage_name,
+        study_name="binding_prediction_optimization_attentionV2",
+        load_if_exists=True,
+    )
+    print("Start hyperparameter optimization...")
+    study.optimize(objective, n_trials=N_TRIALS)
+    print("\n--- Optimization Finished ---")
+    print("Best parameters found: ", study.best_params)
+    print("Best Test MSE: ", study.best_value)
+    df_results = study.trials_dataframe()
+    df_results.to_csv("optuna_results_attention.csv")

templates/index.html ADDED Viewed

	@@ -0,0 +1,103 @@

+<!DOCTYPE html>
+<html lang="en">
+<head>
+    <meta charset="UTF-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1.0">
+    <title>BioBinding AI Vis</title>
+    <link href="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/css/bootstrap.min.css" rel="stylesheet">
+    <style>
+        body { background-color: #f4f6f9; font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif; }
+        .sidebar { background: white; border-right: 1px solid #dee2e6; height: 100vh; overflow-y: auto; }
+        .result-card { border: none; box-shadow: 0 4px 6px rgba(0,0,0,0.05); border-radius: 12px; margin-bottom: 20px; }
+        .affinity-score { font-size: 3rem; font-weight: bold; color: #4e73df; }
+        .atom-badge { font-size: 0.9rem; padding: 8px 12px; }
+        .mol-container { width: 100%; height: 600px; border-radius: 12px; overflow: hidden; border: 1px solid #ddd; }
+        iframe { width: 100%; height: 100%; border: none; }
+    </style>
+</head>
+<body>
+<div class="container-fluid">
+    <div class="row">
+        <div class="col-md-3 sidebar p-4">
+            <h3 class="mb-4 text-primary">🧪 BioBind AI</h3>
+            <form action="/predict" method="post">
+                <div class="mb-3">
+                    <label class="form-label fw-bold">Ligand (SMILES)</label>
+                    <textarea class="form-control" name="smiles_ligand" rows="3" required>{{ smiles_ligand if smiles_ligand else 'COc1ccc(S(=O)(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc2ccccc2)NC(=O)O[C@@H]2C[C@@H]3NC(=O)O[C@@H]3C2)cc1' }}</textarea>
+                </div>
+                <div class="mb-3">
+                    <label class="form-label fw-bold">Protein Sequence</label>
+                    <textarea class="form-control" name="sequence_protein" rows="3" required>{{ sequence_protein if sequence_protein else 'PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKMIGGIGGFIKVRQYDQIIIEIAGHKAIGTVLVGPTPVNIIGRNLLTQIGATLNF' }}</textarea>
+                </div>
+                <button type="submit" class="btn btn-primary w-100 py-2">🔮 Calculate Binding</button>
+            </form>
+            {% if result_ready %}
+            <hr class="my-4">
+            <h5 class="mb-3">Top Important Atoms</h5>
+            <div class="list-group">
+                {% for atom in atom_list %}
+                <div class="list-group-item d-flex justify-content-between align-items-center">
+                    <span>
+                        <span class="fw-bold">#{{ atom.id }}</span> {{ atom.symbol }}
+                    </span>
+                    <span>
+                        <span class="badge bg-light text-dark border">{{ atom.score }}</span>
+                        <span>{{ atom.icon }}</span>
+                    </span>
+                </div>
+                {% endfor %}
+            </div>
+            {% endif %}
+        </div>
+        <div class="col-md-9 p-4">
+            {% if result_ready %}
+                <div class="card result-card p-4 text-center">
+                    <h2 class="text-muted">Predicted Binding Affinity (pKd)</h2>
+                    <div class="affinity-score">{{ affinity }}</div>
+                </div>
+                <div class="card result-card p-3">
+                    <ul class="nav nav-pills mb-3" id="pills-tab" role="tablist">
+                        <li class="nav-item" role="presentation">
+                            <button class="nav-link active" id="pills-py3dmol-tab" data-bs-toggle="pill" data-bs-target="#pills-py3dmol" type="button">🧬 Py3Dmol (High Contrast)</button>
+                        </li>
+                        <li class="nav-item" role="presentation">
+                            <button class="nav-link" id="pills-ngl-tab" data-bs-toggle="pill" data-bs-target="#pills-ngl" type="button">🔬 NGLView</button>
+                        </li>
+                    </ul>
+                    <div class="tab-content" id="pills-tabContent">
+                        <div class="tab-pane fade show active" id="pills-py3dmol" role="tabpanel">
+                            <div class="mol-container">
+                                <iframe srcdoc="{{ html_py3dmol }}" style="width: 100%; height: 100%; border: none;"></iframe>
+                            </div>
+                        </div>
+                        <div class="tab-pane fade" id="pills-ngl" role="tabpanel">
+                            <div class="mol-container">
+                                <iframe src="{{ url_ngl }}" style="width: 100%; height: 100%; border: none;"></iframe>
+                            </div>
+                        </div>
+                    </div>
+                </div>
+            {% else %}
+                <div class="d-flex align-items-center justify-content-center h-100 text-muted">
+                    <div class="text-center">
+                        <h1>🧬 Ready to Analyze</h1>
+                        <p>Enter SMILES and Protein sequence on the left to start.</p>
+                    </div>
+                </div>
+            {% endif %}
+        </div>
+    </div>
+</div>
+<script src="https://cdn.jsdelivr.net/npm/bootstrap@5.3.0/dist/js/bootstrap.bundle.min.js"></script>
+</body>
+</html>

train.py CHANGED Viewed

@@ -20,12 +20,14 @@ WEIGHT_DECAY = 7.06e-6
 EPOCS = 100
 DROPOUT = 0.325
 GAT_HEADS = 2
-DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-LOG_DIR = f"runs/experiment_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
 TOP_K = 3
 SAVES_DIR = LOG_DIR + "/models"
 def set_seed(seed=42):
     random.seed(seed)
     torch.manual_seed(seed)
@@ -52,13 +54,14 @@ def train_epoch(epoch, model, loader, optimizer, criterion, writer):
         total_loss += current_loss
         global_step = (epoch - 1) * len(loader) + i
-        writer.add_scalar('Loss/Train_Step', current_loss, global_step)
-        loop.set_postfix(loss = loss.item())
     avg_loss = total_loss / len(loader)
     return avg_loss
 def evaluate(epoch, model, loader, criterion, writer):
     model.eval()
     total_loss = 0
@@ -70,9 +73,10 @@ def evaluate(epoch, model, loader, criterion, writer):
             total_loss += loss.item()
     avg_loss = total_loss / len(loader)
-    writer.add_scalar('Loss/Test', avg_loss, epoch)
     return avg_loss
 def main():
     gen = set_seed(42)
     writer = SummaryWriter(LOG_DIR)
@@ -82,20 +86,21 @@ def main():
     print(f"Logging to {LOG_DIR}...")
     print(f"Model saves to {SAVES_DIR}...")
     # Load dataset
-    dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
     dataframe.dropna(inplace=True)
     print("Dataset loaded with {} samples".format(len(dataframe)))
-    dataset = BindingDataset(dataframe)
     print("Dataset transformed with {} samples".format(len(dataset)))
     if len(dataset) == 0:
         print("Dataset is empty")
         return
     train_size = int(0.8 * len(dataset))
     test_size = len(dataset) - train_size
-    train_dataset, test_dataset = random_split(dataset, [train_size, test_size], generator=gen)
     train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
@@ -104,40 +109,57 @@ def main():
     model = BindingAffinityModel(
         num_node_features=num_features,
-        hidden_channels=256,
         gat_heads=GAT_HEADS,
-        dropout=DROPOUT
     ).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
     criterion = nn.MSELoss()
     top_models = []
     print(f"Starting training on {DEVICE}")
     for epoch in range(1, EPOCS + 1):
-        train_loss = train_epoch(epoch, model, train_loader, optimizer, criterion, writer)
         test_loss = evaluate(epoch, model, test_loader, criterion, writer)
-        print(f'Epoch {epoch:02d}, Train Loss: {train_loss:.4f}, Test Loss: {test_loss:.4f}')
         filename = f"{SAVES_DIR}/model_ep{epoch:03d}_mse{test_loss:.4f}.pth"
         torch.save(model.state_dict(), filename)
-        top_models.append({'loss': test_loss, 'path': filename, 'epoch': epoch})
-        top_models.sort(key=lambda x: x['loss'])
         if len(top_models) > TOP_K:
             worst_model = top_models.pop()
-            os.remove(worst_model['path'])
-        if any(m['epoch'] == epoch for m in top_models):
-            rank = [m['epoch'] for m in top_models].index(epoch) + 1
-            print(f'-- Model saved (Rank: {rank})')
         else:
             print("")
     writer.close()
     print("Training finished.")
     print("Top models saved:")
@@ -146,4 +168,4 @@ def main():
 if __name__ == "__main__":
-    main()

 EPOCS = 100
 DROPOUT = 0.325
 GAT_HEADS = 2
+HIDDEN_CHANNELS = 256
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+LOG_DIR = f"runs/experiment_scheduler{datetime.now().strftime('%Y%m%d_%H%M%S')}"
 TOP_K = 3
 SAVES_DIR = LOG_DIR + "/models"
 def set_seed(seed=42):
     random.seed(seed)
     torch.manual_seed(seed)
         total_loss += current_loss
         global_step = (epoch - 1) * len(loader) + i
+        writer.add_scalar("Loss/Train_Step", current_loss, global_step)
+        loop.set_postfix(loss=loss.item())
     avg_loss = total_loss / len(loader)
     return avg_loss
 def evaluate(epoch, model, loader, criterion, writer):
     model.eval()
     total_loss = 0
             total_loss += loss.item()
     avg_loss = total_loss / len(loader)
+    writer.add_scalar("Loss/Test", avg_loss, epoch)
     return avg_loss
 def main():
     gen = set_seed(42)
     writer = SummaryWriter(LOG_DIR)
     print(f"Logging to {LOG_DIR}...")
     print(f"Model saves to {SAVES_DIR}...")
     # Load dataset
+    dataframe = pd.read_csv("pdbbind_refined_dataset.csv")
     dataframe.dropna(inplace=True)
     print("Dataset loaded with {} samples".format(len(dataframe)))
+    dataset = BindingDataset(dataframe, max_seq_length=1200)
     print("Dataset transformed with {} samples".format(len(dataset)))
     if len(dataset) == 0:
         print("Dataset is empty")
         return
     train_size = int(0.8 * len(dataset))
     test_size = len(dataset) - train_size
+    train_dataset, test_dataset = random_split(
+        dataset, [train_size, test_size], generator=gen
+    )
     train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
     test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
     model = BindingAffinityModel(
         num_node_features=num_features,
+        hidden_channels=HIDDEN_CHANNELS,
         gat_heads=GAT_HEADS,
+        dropout=DROPOUT,
     ).to(DEVICE)
     optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
+    # factor=0.5 halves the learning rate each time the scheduler triggers
+    # patience=5 waits for 5 epochs without improvement in the monitored loss before reducing the lr
+    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer, mode="min", factor=0.5, patience=5
+    )
     criterion = nn.MSELoss()
     top_models = []
     print(f"Starting training on {DEVICE}")
     for epoch in range(1, EPOCS + 1):
+        train_loss = train_epoch(
+            epoch, model, train_loader, optimizer, criterion, writer
+        )
         test_loss = evaluate(epoch, model, test_loader, criterion, writer)
+        old_lr = optimizer.param_groups[0]["lr"]
+        scheduler.step(test_loss)
+        new_lr = optimizer.param_groups[0]["lr"]
+        if new_lr != old_lr:
+            print(
+                f"\nEpoch {epoch}: Scheduler reduced LR from {old_lr:.6f} to {new_lr:.6f}!"
+            )
+        print(
+            f"Epoch {epoch:02d} | LR: {new_lr:.6f} | Train: {train_loss:.4f} | Test: {test_loss:.4f}",
+            end="",
+        )
         filename = f"{SAVES_DIR}/model_ep{epoch:03d}_mse{test_loss:.4f}.pth"
         torch.save(model.state_dict(), filename)
+        top_models.append({"loss": test_loss, "path": filename, "epoch": epoch})
+        top_models.sort(key=lambda x: x["loss"])
         if len(top_models) > TOP_K:
             worst_model = top_models.pop()
+            os.remove(worst_model["path"])
+        if any(m["epoch"] == epoch for m in top_models):
+            rank = [m["epoch"] for m in top_models].index(epoch) + 1
+            print(f"-- Model saved (Rank: {rank})")
         else:
             print("")
     writer.close()
     print("Training finished.")
     print("Top models saved:")
 if __name__ == "__main__":
+    main()

train_attention.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import random
+import torch
+import torch.nn as nn
+import pandas as pd
+from torch.utils.data import random_split
+from torch_geometric.loader import DataLoader
+from dataset import BindingDataset
+from model_attention import BindingAffinityModel
+from tqdm import tqdm
+from torch.utils.tensorboard import SummaryWriter
+import numpy as np
+from datetime import datetime
+import os
+# 2.02
+# BATCH_SIZE = 16
+# LR = 0.00035  # Reduced learning rate
+# WEIGHT_DECAY = 1e-5  # Slightly increased weight decay (regularization)
+# EPOCHS = 100
+# DROPOUT = 0.3  # Slightly reduced dropout
+# GAT_HEADS = 2
+# HIDDEN_CHANNELS = 256
+# 1.90 from Optuna
+BATCH_SIZE = 16
+LR = 0.000034
+WEIGHT_DECAY = 1e-6
+DROPOUT = 0.26
+EPOCHS = 100
+HIDDEN_CHANNELS = 256
+GAT_HEADS = 2
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+LOG_DIR = f"runs/experiment_attention{datetime.now().strftime('%Y%m%d_%H%M%S')}"
+TOP_K = 3
+SAVES_DIR = LOG_DIR + "/models"
+def set_seed(seed=42):
+    random.seed(seed)
+    torch.manual_seed(seed)
+    torch.cuda.manual_seed(seed)
+    np.random.seed(seed)
+    return torch.Generator().manual_seed(seed)
+def train_epoch(epoch, model, loader, optimizer, criterion, writer):
+    model.train()
+    total_loss = 0
+    loop = tqdm(loader, desc=f"Training epoch: {epoch}", leave=False)
+    for i, batch in enumerate(loop):
+        batch = batch.to(DEVICE)
+        optimizer.zero_grad()
+        out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
+        loss = criterion(out.squeeze(), batch.y.squeeze())
+        loss.backward()
+        optimizer.step()
+        current_loss = loss.item()
+        total_loss += current_loss
+        global_step = (epoch - 1) * len(loader) + i
+        writer.add_scalar("Loss/Train_Step", current_loss, global_step)
+        loop.set_postfix(loss=loss.item())
+    avg_loss = total_loss / len(loader)
+    return avg_loss
+def evaluate(epoch, model, loader, criterion, writer):
+    model.eval()
+    total_loss = 0
+    with torch.no_grad():
+        for batch in tqdm(loader, desc=f"Evaluating epoch: {epoch}", leave=False):
+            batch = batch.to(DEVICE)
+            out = model(batch.x, batch.edge_index, batch.batch, batch.protein_seq)
+            loss = criterion(out.squeeze(), batch.y.squeeze())
+            total_loss += loss.item()
+    avg_loss = total_loss / len(loader)
+    writer.add_scalar("Loss/Test", avg_loss, epoch)
+    return avg_loss
+def main():
+    gen = set_seed(42)
+    writer = SummaryWriter(LOG_DIR)
+    if not os.path.exists(SAVES_DIR):
+        os.makedirs(SAVES_DIR)
+    print(f"Logging to {LOG_DIR}...")
+    print(f"Model saves to {SAVES_DIR}...")
+    # Load dataset
+    dataframe = pd.read_csv("pdbbind_refined_dataset.csv")
+    dataframe.dropna(inplace=True)
+    print("Dataset loaded with {} samples".format(len(dataframe)))
+    dataset = BindingDataset(dataframe, max_seq_length=1200)
+    print("Dataset transformed with {} samples".format(len(dataset)))
+    if len(dataset) == 0:
+        print("Dataset is empty")
+        return
+    train_size = int(0.8 * len(dataset))
+    test_size = len(dataset) - train_size
+    train_dataset, test_dataset = random_split(
+        dataset, [train_size, test_size], generator=gen
+    )
+    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
+    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)
+    num_features = train_dataset[0].x.shape[1]
+    print("Number of node features:", num_features)
+    model = BindingAffinityModel(
+        num_node_features=num_features,
+        hidden_channels=HIDDEN_CHANNELS,
+        gat_heads=GAT_HEADS,
+        dropout=DROPOUT,
+    ).to(DEVICE)
+    optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=WEIGHT_DECAY)
+    # factor=0.5 halves the learning rate each time the scheduler triggers
+    # patience=8 waits for 8 epochs without improvement in the monitored loss before reducing the lr
+    scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
+        optimizer, mode="min", factor=0.5, patience=8
+    )
+    criterion = nn.MSELoss()
+    top_models = []
+    print(f"Starting training on {DEVICE}")
+    for epoch in range(1, EPOCHS + 1):
+        train_loss = train_epoch(
+            epoch, model, train_loader, optimizer, criterion, writer
+        )
+        test_loss = evaluate(epoch, model, test_loader, criterion, writer)
+        old_lr = optimizer.param_groups[0]["lr"]
+        scheduler.step(test_loss)
+        new_lr = optimizer.param_groups[0]["lr"]
+        if new_lr != old_lr:
+            print(
+                f"\nEpoch {epoch}: Scheduler reduced LR from {old_lr:.6f} to {new_lr:.6f}!"
+            )
+        print(
+            f"Epoch {epoch:02d} | LR: {new_lr:.6f} | Train: {train_loss:.4f} | Test: {test_loss:.4f}",
+            end="",
+        )
+        filename = f"{SAVES_DIR}/model_ep{epoch:03d}_mse{test_loss:.4f}.pth"
+        torch.save(model.state_dict(), filename)
+        top_models.append({"loss": test_loss, "path": filename, "epoch": epoch})
+        top_models.sort(key=lambda x: x["loss"])
+        if len(top_models) > TOP_K:
+            worst_model = top_models.pop()
+            os.remove(worst_model["path"])
+        if any(m["epoch"] == epoch for m in top_models):
+            rank = [m["epoch"] for m in top_models].index(epoch) + 1
+            print(f"-- Model saved (Rank: {rank})")
+        else:
+            print("")
+    writer.close()
+    print("Training finished.")
+    print("Top models saved:")
+    for i, m in enumerate(top_models):
+        print(f"{i + 1}. {m['path']} (MSE: {m['loss']:.4f})")
+if __name__ == "__main__":
+    main()

train_pl.py CHANGED Viewed

@@ -6,10 +6,11 @@ from torch.utils.data import random_split
 from model_pl import BindingAffinityModelPL
 import pandas as pd
 def main():
     lr = 0.0005
     # Load dataset
-    dataframe = pd.read_csv('pdbbind_refined_dataset.csv')
     dataframe.dropna(inplace=True)
     print("Dataset loaded with {} samples".format(len(dataframe)))
     dataset = BindingDataset(dataframe)
@@ -30,21 +31,22 @@ def main():
     model = BindingAffinityModelPL(num_node_features=84, hidden_channels_gnn=128, lr=lr)
     checkpoint_callback = ModelCheckpoint(
-        monitor='val_loss',
-        dirpath='checkpoints/',
-        filename='best-checkpoint',
         save_top_k=3,
-        mode='min'
     )
     early_stop_callback = EarlyStopping(monitor="val_loss", patience=5)
     trainer = pl.Trainer(
         max_epochs=20,
-        accelerator="auto", # Use GPU if available
         devices=1,
-        callbacks=[checkpoint_callback, early_stop_callback]
     )
     trainer.fit(model, train_loader, val_loader)
 if __name__ == "__main__":
-    main()

 from model_pl import BindingAffinityModelPL
 import pandas as pd
 def main():
     lr = 0.0005
     # Load dataset
+    dataframe = pd.read_csv("pdbbind_refined_dataset.csv")
     dataframe.dropna(inplace=True)
     print("Dataset loaded with {} samples".format(len(dataframe)))
     dataset = BindingDataset(dataframe)
     model = BindingAffinityModelPL(num_node_features=84, hidden_channels_gnn=128, lr=lr)
     checkpoint_callback = ModelCheckpoint(
+        monitor="val_loss",
+        dirpath="checkpoints/",
+        filename="best-checkpoint",
         save_top_k=3,
+        mode="min",
     )
     early_stop_callback = EarlyStopping(monitor="val_loss", patience=5)
     trainer = pl.Trainer(
         max_epochs=20,
+        accelerator="auto",  # Use GPU if available
         devices=1,
+        callbacks=[checkpoint_callback, early_stop_callback],
     )
     trainer.fit(model, train_loader, val_loader)
 if __name__ == "__main__":
+    main()

transformer_from_scratch/attention_visual.ipynb CHANGED Viewed

@@ -21,6 +21,7 @@
     "import pandas as pd\n",
     "import numpy as np\n",
     "import warnings\n",
     "warnings.filterwarnings(\"ignore\")"
    ]
   },
@@ -72,12 +73,14 @@
    "source": [
     "config = get_config()\n",
     "train_dataloader, val_dataloader, vocab_src, vocab_tgt = get_ds(config)\n",
-    "model = get_model(config, vocab_src.get_vocab_size(), vocab_tgt.get_vocab_size()).to(device)\n",
     "\n",
     "# Load the pretrained weights\n",
     "model_filename = get_weights_file_path(config, f\"34\")\n",
     "state = torch.load(model_filename)\n",
-    "model.load_state_dict(state['model_state_dict'])"
    ]
   },
   {
@@ -95,16 +98,26 @@
     "    decoder_input = batch[\"decoder_input\"].to(device)\n",
     "    decoder_mask = batch[\"decoder_mask\"].to(device)\n",
     "\n",
-    "    encoder_input_tokens = [vocab_src.id_to_token(idx) for idx in encoder_input[0].cpu().numpy()]\n",
-    "    decoder_input_tokens = [vocab_tgt.id_to_token(idx) for idx in decoder_input[0].cpu().numpy()]\n",
     "\n",
     "    # check that the batch size is 1\n",
-    "    assert encoder_input.size(\n",
-    "        0) == 1, \"Batch size must be 1 for validation\"\n",
     "\n",
     "    model_out = greedy_decode(\n",
-    "        model, encoder_input, encoder_mask, vocab_src, vocab_tgt, config['seq_len'], device)\n",
-    "    \n",
     "    return batch, encoder_input_tokens, decoder_input_tokens"
    ]
   },
@@ -132,6 +145,7 @@
     "        columns=[\"row\", \"column\", \"value\", \"row_token\", \"col_token\"],\n",
     "    )\n",
     "\n",
     "def get_attn_map(attn_type: str, layer: int, head: int):\n",
     "    if attn_type == \"encoder\":\n",
     "        attn = model.encoder.layers[layer].self_attention_block.attention_scores\n",
@@ -141,6 +155,7 @@
     "        attn = model.decoder.layers[layer].cross_attention_block.attention_scores\n",
     "    return attn[0, head].data\n",
     "\n",
     "def attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len):\n",
     "    df = mtx2df(\n",
     "        get_attn_map(attn_type, layer, head),\n",
@@ -158,17 +173,29 @@
     "            color=\"value\",\n",
     "            tooltip=[\"row\", \"column\", \"value\", \"row_token\", \"col_token\"],\n",
     "        )\n",
-    "        #.title(f\"Layer {layer} Head {head}\")\n",
     "        .properties(height=400, width=400, title=f\"Layer {layer} Head {head}\")\n",
     "        .interactive()\n",
     "    )\n",
     "\n",
-    "def get_all_attention_maps(attn_type: str, layers: list[int], heads: list[int], row_tokens: list, col_tokens, max_sentence_len: int):\n",
     "    charts = []\n",
     "    for layer in layers:\n",
     "        rowCharts = []\n",
     "        for head in heads:\n",
-    "            rowCharts.append(attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len))\n",
     "        charts.append(alt.hconcat(*rowCharts))\n",
     "    return alt.vconcat(*charts)"
    ]
@@ -287,7 +314,14 @@
     "heads = [0, 1, 2, 3, 4, 5, 6, 7]\n",
     "\n",
     "# Encoder Self-Attention\n",
-    "get_all_attention_maps(\"encoder\", layers, heads, encoder_input_tokens, encoder_input_tokens, min(20, sentence_len))"
    ]
   },
   {
@@ -379,7 +413,14 @@
    ],
    "source": [
     "# Encoder Self-Attention\n",
-    "get_all_attention_maps(\"decoder\", layers, heads, decoder_input_tokens, decoder_input_tokens, min(20, sentence_len))"
    ]
   },
   {
@@ -471,7 +512,14 @@
    ],
    "source": [
     "# Encoder Self-Attention\n",
-    "get_all_attention_maps(\"encoder-decoder\", layers, heads, encoder_input_tokens, decoder_input_tokens, min(20, sentence_len))"
    ]
   },
   {

     "import pandas as pd\n",
     "import numpy as np\n",
     "import warnings\n",
+    "\n",
     "warnings.filterwarnings(\"ignore\")"
    ]
   },
    "source": [
     "config = get_config()\n",
     "train_dataloader, val_dataloader, vocab_src, vocab_tgt = get_ds(config)\n",
+    "model = get_model(config, vocab_src.get_vocab_size(), vocab_tgt.get_vocab_size()).to(\n",
+    "    device\n",
+    ")\n",
     "\n",
     "# Load the pretrained weights\n",
     "model_filename = get_weights_file_path(config, f\"34\")\n",
     "state = torch.load(model_filename)\n",
+    "model.load_state_dict(state[\"model_state_dict\"])"
    ]
   },
   {
     "    decoder_input = batch[\"decoder_input\"].to(device)\n",
     "    decoder_mask = batch[\"decoder_mask\"].to(device)\n",
     "\n",
+    "    encoder_input_tokens = [\n",
+    "        vocab_src.id_to_token(idx) for idx in encoder_input[0].cpu().numpy()\n",
+    "    ]\n",
+    "    decoder_input_tokens = [\n",
+    "        vocab_tgt.id_to_token(idx) for idx in decoder_input[0].cpu().numpy()\n",
+    "    ]\n",
     "\n",
     "    # check that the batch size is 1\n",
+    "    assert encoder_input.size(0) == 1, \"Batch size must be 1 for validation\"\n",
     "\n",
     "    model_out = greedy_decode(\n",
+    "        model,\n",
+    "        encoder_input,\n",
+    "        encoder_mask,\n",
+    "        vocab_src,\n",
+    "        vocab_tgt,\n",
+    "        config[\"seq_len\"],\n",
+    "        device,\n",
+    "    )\n",
+    "\n",
     "    return batch, encoder_input_tokens, decoder_input_tokens"
    ]
   },
     "        columns=[\"row\", \"column\", \"value\", \"row_token\", \"col_token\"],\n",
     "    )\n",
     "\n",
+    "\n",
     "def get_attn_map(attn_type: str, layer: int, head: int):\n",
     "    if attn_type == \"encoder\":\n",
     "        attn = model.encoder.layers[layer].self_attention_block.attention_scores\n",
     "        attn = model.decoder.layers[layer].cross_attention_block.attention_scores\n",
     "    return attn[0, head].data\n",
     "\n",
+    "\n",
     "def attn_map(attn_type, layer, head, row_tokens, col_tokens, max_sentence_len):\n",
     "    df = mtx2df(\n",
     "        get_attn_map(attn_type, layer, head),\n",
     "            color=\"value\",\n",
     "            tooltip=[\"row\", \"column\", \"value\", \"row_token\", \"col_token\"],\n",
     "        )\n",
+    "        # .title(f\"Layer {layer} Head {head}\")\n",
     "        .properties(height=400, width=400, title=f\"Layer {layer} Head {head}\")\n",
     "        .interactive()\n",
     "    )\n",
     "\n",
+    "\n",
+    "def get_all_attention_maps(\n",
+    "    attn_type: str,\n",
+    "    layers: list[int],\n",
+    "    heads: list[int],\n",
+    "    row_tokens: list,\n",
+    "    col_tokens,\n",
+    "    max_sentence_len: int,\n",
+    "):\n",
     "    charts = []\n",
     "    for layer in layers:\n",
     "        rowCharts = []\n",
     "        for head in heads:\n",
+    "            rowCharts.append(\n",
+    "                attn_map(\n",
+    "                    attn_type, layer, head, row_tokens, col_tokens, max_sentence_len\n",
+    "                )\n",
+    "            )\n",
     "        charts.append(alt.hconcat(*rowCharts))\n",
     "    return alt.vconcat(*charts)"
    ]
     "heads = [0, 1, 2, 3, 4, 5, 6, 7]\n",
     "\n",
     "# Encoder Self-Attention\n",
+    "get_all_attention_maps(\n",
+    "    \"encoder\",\n",
+    "    layers,\n",
+    "    heads,\n",
+    "    encoder_input_tokens,\n",
+    "    encoder_input_tokens,\n",
+    "    min(20, sentence_len),\n",
+    ")"
    ]
   },
   {
    ],
    "source": [
     "# Encoder Self-Attention\n",
+    "get_all_attention_maps(\n",
+    "    \"decoder\",\n",
+    "    layers,\n",
+    "    heads,\n",
+    "    decoder_input_tokens,\n",
+    "    decoder_input_tokens,\n",
+    "    min(20, sentence_len),\n",
+    ")"
    ]
   },
   {
    ],
    "source": [
     "# Encoder Self-Attention\n",
+    "get_all_attention_maps(\n",
+    "    \"encoder-decoder\",\n",
+    "    layers,\n",
+    "    heads,\n",
+    "    encoder_input_tokens,\n",
+    "    decoder_input_tokens,\n",
+    "    min(20, sentence_len),\n",
+    ")"
    ]
   },
   {

transformer_from_scratch/config.py CHANGED Viewed

@@ -14,14 +14,15 @@ def get_config():
         "model_basename": "tmodel_",
         "preload": None,
         "tokenizer_file": "tokenizer_{0}.json",
-        "experiment_name": "runs/tmodel"
     }
 def get_weights_file_path(config, epoch):
     model_folder = config["model_folder"]
     model_basename = config["model_basename"]
     model_filename = f"{model_basename}{epoch}.pt"
-    return str(Path('.') / model_folder / model_filename)
 def latest_weights_file_path(config):
@@ -31,4 +32,4 @@ def latest_weights_file_path(config):
     if len(weights_files) == 0:
         return None
     weights_files.sort()
-    return str(weights_files[-1])

         "model_basename": "tmodel_",
         "preload": None,
         "tokenizer_file": "tokenizer_{0}.json",
+        "experiment_name": "runs/tmodel",
     }
 def get_weights_file_path(config, epoch):
     model_folder = config["model_folder"]
     model_basename = config["model_basename"]
     model_filename = f"{model_basename}{epoch}.pt"
+    return str(Path(".") / model_folder / model_filename)
 def latest_weights_file_path(config):
     if len(weights_files) == 0:
         return None
     weights_files.sort()
+    return str(weights_files[-1])

transformer_from_scratch/dataset.py CHANGED Viewed

@@ -13,26 +13,34 @@ class BilingualDataset(Dataset):
         self.src_lang = src_lang
         self.tgt_lang = tgt_lang
-        self.sos_token = torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64)
-        self.eos_token = torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64)
-        self.pad_token = torch.tensor([tokenizer_src.token_to_id('[PAD]')], dtype=torch.int64)
     def __len__(self):
         return len(self.ds)
     def __getitem__(self, index):
         src_target_pair = self.ds[index]
-        src_text = src_target_pair['translation'][self.src_lang]
-        tgt_text = src_target_pair['translation'][self.tgt_lang]
         enc_input_tokens = self.tokenizer_src.encode(src_text).ids
         dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
-        enc_num_padding_tokens = self.seq_len - len(enc_input_tokens) - 2  # for SOS and EOS
-        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1 # for SOS
         if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
-            raise ValueError('Sentence is too long')
         # Add SOS and EOS tokens to source text
         encoder_input = torch.cat(
@@ -40,7 +48,9 @@ class BilingualDataset(Dataset):
                 self.sos_token,
                 torch.tensor(enc_input_tokens, dtype=torch.int64),
                 self.eos_token,
-                torch.tensor([self.pad_token] * enc_num_padding_tokens, dtype=torch.int64)
             ]
         )
         # Add SOS token to the decoder input
@@ -48,7 +58,9 @@ class BilingualDataset(Dataset):
             [
                 self.sos_token,
                 torch.tensor(dec_input_tokens, dtype=torch.int64),
-                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)
             ]
         )
         # Add EOS token to the label (what we want )
@@ -56,7 +68,9 @@ class BilingualDataset(Dataset):
             [
                 torch.tensor(dec_input_tokens, dtype=torch.int64),
                 self.eos_token,
-                torch.tensor([self.pad_token] * dec_num_padding_tokens, dtype=torch.int64)
             ]
         )
@@ -65,15 +79,27 @@ class BilingualDataset(Dataset):
         assert label.size(0) == self.seq_len
         return {
-            "encoder_input": encoder_input, # (Seq_len)
-            "decoder_input": decoder_input, # (Seq_len)
-            "encoder_mask": (encoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int(), # (1, 1, Seq_len)
-            "decoder_mask": (decoder_input != self.pad_token).unsqueeze(0).unsqueeze(0).int() & casual_mask(decoder_input.size(0)), # (1, Seq_len) & (1, Seq_len, Seq_len)
-            "label": label,                 # (Seq_len)
             "src_text": src_text,
-            "tgt_text": tgt_text
         }
 def casual_mask(size):
-    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(torch.int) # Upper triangular matrix, above the main diagonal
-    return mask==0

         self.src_lang = src_lang
         self.tgt_lang = tgt_lang
+        self.sos_token = torch.tensor(
+            [tokenizer_src.token_to_id("[SOS]")], dtype=torch.int64
+        )
+        self.eos_token = torch.tensor(
+            [tokenizer_src.token_to_id("[EOS]")], dtype=torch.int64
+        )
+        self.pad_token = torch.tensor(
+            [tokenizer_src.token_to_id("[PAD]")], dtype=torch.int64
+        )
     def __len__(self):
         return len(self.ds)
     def __getitem__(self, index):
         src_target_pair = self.ds[index]
+        src_text = src_target_pair["translation"][self.src_lang]
+        tgt_text = src_target_pair["translation"][self.tgt_lang]
         enc_input_tokens = self.tokenizer_src.encode(src_text).ids
         dec_input_tokens = self.tokenizer_tgt.encode(tgt_text).ids
+        enc_num_padding_tokens = (
+            self.seq_len - len(enc_input_tokens) - 2
+        )  # for SOS and EOS
+        dec_num_padding_tokens = self.seq_len - len(dec_input_tokens) - 1  # for SOS
         if enc_num_padding_tokens < 0 or dec_num_padding_tokens < 0:
+            raise ValueError("Sentence is too long")
         # Add SOS and EOS tokens to source text
         encoder_input = torch.cat(
                 self.sos_token,
                 torch.tensor(enc_input_tokens, dtype=torch.int64),
                 self.eos_token,
+                torch.tensor(
+                    [self.pad_token] * enc_num_padding_tokens, dtype=torch.int64
+                ),
             ]
         )
         # Add SOS token to the decoder input
             [
                 self.sos_token,
                 torch.tensor(dec_input_tokens, dtype=torch.int64),
+                torch.tensor(
+                    [self.pad_token] * dec_num_padding_tokens, dtype=torch.int64
+                ),
             ]
         )
         # Add EOS token to the label (what we want )
             [
                 torch.tensor(dec_input_tokens, dtype=torch.int64),
                 self.eos_token,
+                torch.tensor(
+                    [self.pad_token] * dec_num_padding_tokens, dtype=torch.int64
+                ),
             ]
         )
         assert label.size(0) == self.seq_len
         return {
+            "encoder_input": encoder_input,  # (Seq_len)
+            "decoder_input": decoder_input,  # (Seq_len)
+            "encoder_mask": (encoder_input != self.pad_token)
+            .unsqueeze(0)
+            .unsqueeze(0)
+            .int(),  # (1, 1, Seq_len)
+            "decoder_mask": (decoder_input != self.pad_token)
+            .unsqueeze(0)
+            .unsqueeze(0)
+            .int()
+            & casual_mask(
+                decoder_input.size(0)
+            ),  # (1, Seq_len) & (1, Seq_len, Seq_len)
+            "label": label,  # (Seq_len)
             "src_text": src_text,
+            "tgt_text": tgt_text,
         }
 def casual_mask(size):
+    mask = torch.triu(torch.ones((1, size, size)), diagonal=1).type(
+        torch.int
+    )  # Upper triangular matrix, above the main diagonal
+    return mask == 0

transformer_from_scratch/inference.ipynb CHANGED Viewed

@@ -12,7 +12,7 @@
    },
    "source": [
     "import torch\n",
-    "from config import get_config,latest_weights_file_path\n",
     "from train import get_model, get_ds, run_validation\n",
     "from translate import translate"
    ],
@@ -22,10 +22,10 @@
      "evalue": "cannot import name 'get_model' from 'train' (C:\\Users\\Alex\\Desktop\\binding_affinity\\train.py)",
      "output_type": "error",
      "traceback": [
-      "\u001B[31m---------------------------------------------------------------------------\u001B[39m",
-      "\u001B[31mImportError\u001B[39m                               Traceback (most recent call last)",
-      "\u001B[36mCell\u001B[39m\u001B[36m \u001B[39m\u001B[32mIn[2]\u001B[39m\u001B[32m, line 3\u001B[39m\n\u001B[32m      1\u001B[39m \u001B[38;5;28;01mimport\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mtorch\u001B[39;00m\n\u001B[32m      2\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mconfig\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m get_config,latest_weights_file_path\n\u001B[32m----> \u001B[39m\u001B[32m3\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mtrain\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m get_model, get_ds, run_validation\n\u001B[32m      4\u001B[39m \u001B[38;5;28;01mfrom\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[34;01mtranslate\u001B[39;00m\u001B[38;5;250m \u001B[39m\u001B[38;5;28;01mimport\u001B[39;00m translate\n",
-      "\u001B[31mImportError\u001B[39m: cannot import name 'get_model' from 'train' (C:\\Users\\Alex\\Desktop\\binding_affinity\\train.py)"
      ]
     }
    ],
@@ -42,12 +42,14 @@
     "print(\"Using device:\", device)\n",
     "config = get_config()\n",
     "train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)\n",
-    "model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)\n",
     "\n",
     "# Load the pretrained weights\n",
     "model_filename = latest_weights_file_path(config)\n",
     "state = torch.load(model_filename)\n",
-    "model.load_state_dict(state['model_state_dict'])"
    ],
    "id": "e6b0b6022c4d1c15"
   },
@@ -56,7 +58,20 @@
    "cell_type": "code",
    "outputs": [],
    "execution_count": null,
-   "source": "run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: print(msg), 0, None, num_examples=10)",
    "id": "be2c2169c183a445"
   },
   {

    },
    "source": [
     "import torch\n",
+    "from config import get_config, latest_weights_file_path\n",
     "from train import get_model, get_ds, run_validation\n",
     "from translate import translate"
    ],
      "evalue": "cannot import name 'get_model' from 'train' (C:\\Users\\Alex\\Desktop\\binding_affinity\\train.py)",
      "output_type": "error",
      "traceback": [
+      "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
+      "\u001b[31mImportError\u001b[39m                               Traceback (most recent call last)",
+      "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 3\u001b[39m\n\u001b[32m      1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtorch\u001b[39;00m\n\u001b[32m      2\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mconfig\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_config,latest_weights_file_path\n\u001b[32m----> \u001b[39m\u001b[32m3\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtrain\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m get_model, get_ds, run_validation\n\u001b[32m      4\u001b[39m \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01mtranslate\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mimport\u001b[39;00m translate\n",
+      "\u001b[31mImportError\u001b[39m: cannot import name 'get_model' from 'train' (C:\\Users\\Alex\\Desktop\\binding_affinity\\train.py)"
      ]
     }
    ],
     "print(\"Using device:\", device)\n",
     "config = get_config()\n",
     "train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)\n",
+    "model = get_model(\n",
+    "    config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()\n",
+    ").to(device)\n",
     "\n",
     "# Load the pretrained weights\n",
     "model_filename = latest_weights_file_path(config)\n",
     "state = torch.load(model_filename)\n",
+    "model.load_state_dict(state[\"model_state_dict\"])"
    ],
    "id": "e6b0b6022c4d1c15"
   },
    "cell_type": "code",
    "outputs": [],
    "execution_count": null,
+   "source": [
+    "run_validation(\n",
+    "    model,\n",
+    "    val_dataloader,\n",
+    "    tokenizer_src,\n",
+    "    tokenizer_tgt,\n",
+    "    config[\"seq_len\"],\n",
+    "    device,\n",
+    "    lambda msg: print(msg),\n",
+    "    0,\n",
+    "    None,\n",
+    "    num_examples=10,\n",
+    ")"
+   ],
    "id": "be2c2169c183a445"
   },
   {

transformer_from_scratch/train.py CHANGED Viewed

@@ -19,19 +19,24 @@ from model import build_transformer
 from config import get_weights_file_path, get_config
 import warnings
-def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device):
-    sos_idx = tokenizer_tgt.token_to_id('[SOS]')
-    eos_idx = tokenizer_tgt.token_to_id('[EOS]')
     # Precompute the encoder output and reuse it for every token we get from the decoder
     encoder_output = model.encode(source, source_mask)
     # Initialize the decoder input with the sos token
     decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)
     while True:
-        if decoder_input .size(1) == max_len:
             break
         # Build mask for the target (decoder input)
-        decoder_mask = casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)
         # Calculate the output of the decoder
         out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)
@@ -40,15 +45,31 @@ def greedy_decode(model, source, source_mask, tokenizer_src, tokenizer_tgt, max_
         prob = model.project(out[:, -1])
         # Select the token with the highest probability (because it's a greedy search)
         _, next_word = torch.max(prob, dim=1)
-        decoder_input = torch.cat([decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1)
         if next_word == eos_idx:
             break
     return decoder_input.squeeze(0)
-def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len, device, print_msg, global_step, writer, num_examples=2):
     model.eval()
     count = 0
@@ -61,25 +82,33 @@ def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len,
     with torch.no_grad():
         for batch in validation_ds:
             count += 1
-            encoder_input = batch['encoder_input'].to(device)
-            encoder_mask = batch['encoder_mask'].to(device)
             assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"
-            model_out = greedy_decode(model, encoder_input, encoder_mask, tokenizer_src, tokenizer_tgt, max_len, device)
-            source_text = batch['src_text'][0]
-            target_text = batch['tgt_text'][0]
             model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())
             source_texts.append(source_text)
             expected.append(target_text)
             predicted.append(model_out_text)
-            print_msg('-' * console_width)
-            print_msg(f'Source: {source_text}')
-            print_msg(f'Expected: {target_text}')
-            print_msg(f'Predicted: {model_out_text}')
             if count == num_examples:
                 break
@@ -91,25 +120,22 @@ def run_validation(model, validation_ds, tokenizer_src, tokenizer_tgt, max_len,
         # Compute the char error rate
         metric = CharErrorRate()
         cer = metric(predicted, expected)
-        writer.add_scalar('validation cer', cer, global_step)
         writer.flush()
         # Compute the word error rate
         metric = WordErrorRate()
         wer = metric(predicted, expected)
-        writer.add_scalar('validation wer', wer, global_step)
         writer.flush()
         # Compute the BLEU metric
         metric = BLEUScore()
         bleu = metric(predicted, expected)
-        writer.add_scalar('validation BLEU', bleu, global_step)
         writer.flush()
 def get_all_sentences(ds, lang):
     for item in ds:
         yield item["translation"][lang]
@@ -145,84 +171,117 @@ def get_ds(config):
     val_ds_size = len(ds_raw) - train_ds_size
     train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])
-    train_ds = BilingualDataset(train_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
-    val_ds = BilingualDataset(val_ds_raw, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'], config['seq_len'])
     max_len_src = 0
     max_len_tgt = 0
     for item in ds_raw:
-        src_ids = tokenizer_src.encode(item['translation'][config['lang_src']]).ids
-        tgt_ids = tokenizer_tgt.encode(item['translation'][config['lang_tgt']]).ids
         max_len_src = max(len(src_ids), max_len_src)
         max_len_tgt = max(len(tgt_ids), max_len_tgt)
-    print(f'Max length of the source sentence: {max_len_src}')
-    print(f'Max length of the target sentence: {max_len_tgt}')
-    train_dataloader = DataLoader(train_ds, batch_size=config['batch_size'], shuffle=True)
     val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)
     return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt
 def get_model(config, vocab_src_len, vocab_tgt_len):
-    model = build_transformer(vocab_src_len, vocab_tgt_len, config['seq_len'], config['seq_len'], config['d_model'])
     return model
 def train_model(config):
     # Define the device
-    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-    print(f'using device: {device}')
-    Path(config['model_folder']).mkdir(parents=True, exist_ok=True)
     train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
-    model = get_model(config, tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size()).to(device)
     # Tensorboard
-    writer = SummaryWriter(config['experiment_name'])
-    optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], eps=1e-9)
     initial_epoch = 0
     global_step = 0
-    if config['preload']:
-        model_filename = get_weights_file_path(config, config['preload'])
-        print(f'Preloading model {model_filename}')
         state = torch.load(model_filename)
-        initial_epoch = state['epoch'] + 1
-        optimizer.load_state_dict(state['optimizer_state_dict'])
-        global_step = state['global_step']
-    loss_fn = torch.nn.CrossEntropyLoss(ignore_index=tokenizer_src.token_to_id('[PAD]'), label_smoothing=0.1)
-    for epoch in range(initial_epoch, config['num_epochs']):
-        batch_iterator = tqdm(train_dataloader,desc=f'Processing epoch {epoch:02d}')
         for batch in batch_iterator:
             model.train()
-            encoder_input = batch['encoder_input'].to(device) # (B, Seq_len)
-            decoder_input = batch['decoder_input'].to(device) # (B, Seq_len)
-            encoder_mask = batch['encoder_mask'].to(device) # (B, 1, 1, Seq_len)
-            decoder_mask = batch['decoder_mask'].to(device) # (B, 1, Seq_len, Seq_len)
             # Run the tensors through the transformer model
-            encoder_output = model.encode(encoder_input, encoder_mask) # (B, Seq_len, d_model)
-            decoder_output = model.decode(encoder_output, encoder_mask, decoder_input, decoder_mask) # (B, Seq_len, d_model)
-            proj_output = model.project(decoder_output) # (B, Seq_len, tgt_vocab_size)
-            label = batch['label'].to(device) # (B, Seq_len)
             # (B, Seq_len, tgt_vocab_size) --> (B * Seq_len, tgt_vocab_size)
-            loss = loss_fn(proj_output.view(-1, tokenizer_tgt.get_vocab_size()), label.view(-1))
-            batch_iterator.set_postfix({f'loss' : f'{loss.item(): 6.3f}'})
             # Log the loss
-            writer.add_scalar('train loss', loss.item(), global_step)
             writer.flush()
             # Backpropagate the loss
@@ -234,23 +293,32 @@ def train_model(config):
             global_step += 1
-        run_validation(model, val_dataloader, tokenizer_src, tokenizer_tgt, config['seq_len'], device, lambda msg: batch_iterator.write(msg), global_step, writer)
         # Save the model at the end of each epoch
-        model_filename = get_weights_file_path(config, f'{epoch:02d}')
         torch.save(
             {
-                'epoch': epoch,
-                'model_state_dict': model.state_dict(),
-                'optimizer_state_dict': optimizer.state_dict(),
-                'global_step': global_step
-            }, model_filename)
 if __name__ == "__main__":
-    warnings.filterwarnings('ignore')
     config = get_config()
     train_model(config)

 from config import get_weights_file_path, get_config
 import warnings
+def greedy_decode(
+    model, source, source_mask, tokenizer_src, tokenizer_tgt, max_len, device
+):
+    sos_idx = tokenizer_tgt.token_to_id("[SOS]")
+    eos_idx = tokenizer_tgt.token_to_id("[EOS]")
     # Precompute the encoder output and reuse it for every token we get from the decoder
     encoder_output = model.encode(source, source_mask)
     # Initialize the decoder input with the sos token
     decoder_input = torch.empty(1, 1).fill_(sos_idx).type_as(source).to(device)
     while True:
+        if decoder_input.size(1) == max_len:
             break
         # Build mask for the target (decoder input)
+        decoder_mask = (
+            casual_mask(decoder_input.size(1)).type_as(source_mask).to(device)
+        )
         # Calculate the output of the decoder
         out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)
         prob = model.project(out[:, -1])
         # Select the token with the highest probability (because it's a greedy search)
         _, next_word = torch.max(prob, dim=1)
+        decoder_input = torch.cat(
+            [
+                decoder_input,
+                torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device),
+            ],
+            dim=1,
+        )
         if next_word == eos_idx:
             break
     return decoder_input.squeeze(0)
+def run_validation(
+    model,
+    validation_ds,
+    tokenizer_src,
+    tokenizer_tgt,
+    max_len,
+    device,
+    print_msg,
+    global_step,
+    writer,
+    num_examples=2,
+):
     model.eval()
     count = 0
     with torch.no_grad():
         for batch in validation_ds:
             count += 1
+            encoder_input = batch["encoder_input"].to(device)
+            encoder_mask = batch["encoder_mask"].to(device)
             assert encoder_input.size(0) == 1, "Batch size must be 1 for validation"
+            model_out = greedy_decode(
+                model,
+                encoder_input,
+                encoder_mask,
+                tokenizer_src,
+                tokenizer_tgt,
+                max_len,
+                device,
+            )
+            source_text = batch["src_text"][0]
+            target_text = batch["tgt_text"][0]
             model_out_text = tokenizer_tgt.decode(model_out.detach().cpu().numpy())
             source_texts.append(source_text)
             expected.append(target_text)
             predicted.append(model_out_text)
+            print_msg("-" * console_width)
+            print_msg(f"Source: {source_text}")
+            print_msg(f"Expected: {target_text}")
+            print_msg(f"Predicted: {model_out_text}")
             if count == num_examples:
                 break
         # Compute the char error rate
         metric = CharErrorRate()
         cer = metric(predicted, expected)
+        writer.add_scalar("validation cer", cer, global_step)
         writer.flush()
         # Compute the word error rate
         metric = WordErrorRate()
         wer = metric(predicted, expected)
+        writer.add_scalar("validation wer", wer, global_step)
         writer.flush()
         # Compute the BLEU metric
         metric = BLEUScore()
         bleu = metric(predicted, expected)
+        writer.add_scalar("validation BLEU", bleu, global_step)
         writer.flush()
 def get_all_sentences(ds, lang):
     for item in ds:
         yield item["translation"][lang]
     val_ds_size = len(ds_raw) - train_ds_size
     train_ds_raw, val_ds_raw = random_split(ds_raw, [train_ds_size, val_ds_size])
+    train_ds = BilingualDataset(
+        train_ds_raw,
+        tokenizer_src,
+        tokenizer_tgt,
+        config["lang_src"],
+        config["lang_tgt"],
+        config["seq_len"],
+    )
+    val_ds = BilingualDataset(
+        val_ds_raw,
+        tokenizer_src,
+        tokenizer_tgt,
+        config["lang_src"],
+        config["lang_tgt"],
+        config["seq_len"],
+    )
     max_len_src = 0
     max_len_tgt = 0
     for item in ds_raw:
+        src_ids = tokenizer_src.encode(item["translation"][config["lang_src"]]).ids
+        tgt_ids = tokenizer_tgt.encode(item["translation"][config["lang_tgt"]]).ids
         max_len_src = max(len(src_ids), max_len_src)
         max_len_tgt = max(len(tgt_ids), max_len_tgt)
+    print(f"Max length of the source sentence: {max_len_src}")
+    print(f"Max length of the target sentence: {max_len_tgt}")
+    train_dataloader = DataLoader(
+        train_ds, batch_size=config["batch_size"], shuffle=True
+    )
     val_dataloader = DataLoader(val_ds, batch_size=1, shuffle=True)
     return train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt
 def get_model(config, vocab_src_len, vocab_tgt_len):
     """Build the transformer for the given source/target vocabulary sizes.

     ``config["seq_len"]`` is passed for both sequence-length arguments of
     ``build_transformer`` and ``config["d_model"]`` as the fifth argument.
     """
     seq_len = config["seq_len"]
     return build_transformer(
         vocab_src_len, vocab_tgt_len, seq_len, seq_len, config["d_model"]
     )
 def train_model(config):
     """Train the translation transformer and checkpoint it after every epoch.

     Reads ``model_folder``, ``experiment_name``, ``lr``, ``preload``,
     ``num_epochs`` and ``seq_len`` from ``config``. When ``config["preload"]``
     is truthy, the matching checkpoint is loaded and training resumes from
     the stored epoch / global step.

     Side effects: creates the model folder, writes TensorBoard scalars, runs
     validation after each epoch and saves one checkpoint file per epoch.
     """
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print(f"using device: {device}")
     Path(config["model_folder"]).mkdir(parents=True, exist_ok=True)

     train_dataloader, val_dataloader, tokenizer_src, tokenizer_tgt = get_ds(config)
     tgt_vocab_size = tokenizer_tgt.get_vocab_size()
     model = get_model(config, tokenizer_src.get_vocab_size(), tgt_vocab_size).to(
         device
     )

     # Tensorboard
     writer = SummaryWriter(config["experiment_name"])
     optimizer = torch.optim.Adam(model.parameters(), lr=config["lr"], eps=1e-9)

     initial_epoch = 0
     global_step = 0
     if config["preload"]:
         model_filename = get_weights_file_path(config, config["preload"])
         print(f"Preloading model {model_filename}")
         # map_location lets a checkpoint saved on GPU be resumed on a CPU host.
         state = torch.load(model_filename, map_location=device)
         # Restore the weights too; resuming with only the optimizer state
         # would continue training from a freshly initialised model.
         model.load_state_dict(state["model_state_dict"])
         initial_epoch = state["epoch"] + 1
         optimizer.load_state_dict(state["optimizer_state_dict"])
         global_step = state["global_step"]

     # The labels are target-side token ids, so the ignored padding id must
     # come from the target tokenizer.
     loss_fn = torch.nn.CrossEntropyLoss(
         ignore_index=tokenizer_tgt.token_to_id("[PAD]"), label_smoothing=0.1
     )

     for epoch in range(initial_epoch, config["num_epochs"]):
         batch_iterator = tqdm(train_dataloader, desc=f"Processing epoch {epoch:02d}")
         for batch in batch_iterator:
             model.train()
             encoder_input = batch["encoder_input"].to(device)  # (B, Seq_len)
             decoder_input = batch["decoder_input"].to(device)  # (B, Seq_len)
             encoder_mask = batch["encoder_mask"].to(device)  # (B, 1, 1, Seq_len)
             decoder_mask = batch["decoder_mask"].to(device)  # (B, 1, Seq_len, Seq_len)

             # Run the tensors through the transformer model
             encoder_output = model.encode(
                 encoder_input, encoder_mask
             )  # (B, Seq_len, d_model)
             decoder_output = model.decode(
                 encoder_output, encoder_mask, decoder_input, decoder_mask
             )  # (B, Seq_len, d_model)
             proj_output = model.project(decoder_output)  # (B, Seq_len, tgt_vocab_size)
             label = batch["label"].to(device)  # (B, Seq_len)

             # (B, Seq_len, tgt_vocab_size) --> (B * Seq_len, tgt_vocab_size)
             loss = loss_fn(proj_output.view(-1, tgt_vocab_size), label.view(-1))
             loss_value = loss.item()
             batch_iterator.set_postfix({"loss": f"{loss_value: 6.3f}"})

             # Log the loss
             writer.add_scalar("train loss", loss_value, global_step)
             writer.flush()

             # Backpropagate the loss and update the weights; without these
             # steps the parameters never change.
             loss.backward()
             optimizer.step()
             optimizer.zero_grad(set_to_none=True)
             global_step += 1

         run_validation(
             model,
             val_dataloader,
             tokenizer_src,
             tokenizer_tgt,
             config["seq_len"],
             device,
             lambda msg: batch_iterator.write(msg),
             global_step,
             writer,
         )

         # Save the model at the end of each epoch
         model_filename = get_weights_file_path(config, f"{epoch:02d}")
         torch.save(
             {
                 "epoch": epoch,
                 "model_state_dict": model.state_dict(),
                 "optimizer_state_dict": optimizer.state_dict(),
                 "global_step": global_step,
             },
             model_filename,
         )
 if __name__ == "__main__":
     # Silence all warnings, then train with the project configuration.
     warnings.filterwarnings("ignore")
     train_model(get_config())

transformer_from_scratch/translate.py CHANGED Viewed

@@ -7,32 +7,53 @@ from dataset import BilingualDataset
 import torch
 import sys
 def translate(sentence: str):
     # Define the device, tokenizers, and model
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("Using device:", device)
     config = get_config()
-    tokenizer_src = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_src']))))
-    tokenizer_tgt = Tokenizer.from_file(str(Path(config['tokenizer_file'].format(config['lang_tgt']))))
-    model = build_transformer(tokenizer_src.get_vocab_size(), tokenizer_tgt.get_vocab_size(), config["seq_len"], config['seq_len'], d_model=config['d_model']).to(device)
     # Load the pretrained weights
     model_filename = latest_weights_file_path(config)
     state = torch.load(model_filename)
-    model.load_state_dict(state['model_state_dict'])
     # if the sentence is a number use it as an index to the test set
     label = ""
     if type(sentence) == int or sentence.isdigit():
         id = int(sentence)
-        ds = load_dataset(f"{config['datasource']}", f"{config['lang_src']}-{config['lang_tgt']}", split='all')
-        ds = BilingualDataset(ds, tokenizer_src, tokenizer_tgt, config['lang_src'], config['lang_tgt'],
-                              config['seq_len'])
-        sentence = ds[id]['src_text']
         label = ds[id]["tgt_text"]
-    seq_len = config['seq_len']
     # translate the sentence
@@ -40,46 +61,82 @@ def translate(sentence: str):
     with torch.no_grad():
         # Precompute the encoder output and reuse it for every generation step
         source = tokenizer_src.encode(sentence)
-        source = torch.cat([
-            torch.tensor([tokenizer_src.token_to_id('[SOS]')], dtype=torch.int64),
-            torch.tensor(source.ids, dtype=torch.int64),
-            torch.tensor([tokenizer_src.token_to_id('[EOS]')], dtype=torch.int64),
-            torch.tensor([tokenizer_src.token_to_id('[PAD]')] * (seq_len - len(source.ids) - 2), dtype=torch.int64)
-        ], dim=0).to(device)
-        source_mask = (source != tokenizer_src.token_to_id('[PAD]')).unsqueeze(0).unsqueeze(0).int().to(device)
         encoder_output = model.encode(source, source_mask)
         # Initialize the decoder input with the sos token
-        decoder_input = torch.empty(1, 1).fill_(tokenizer_tgt.token_to_id('[SOS]')).type_as(source).to(device)
         # Print the source sentence and target start prompt
-        if label != "": print(f"{f'ID: ':>12}{id}")
         print(f"{f'SOURCE: ':>12}{sentence}")
-        if label != "": print(f"{f'TARGET: ':>12}{label}")
-        print(f"{f'PREDICTED: ':>12}", end='')
         # Generate the translation word by word
         while decoder_input.size(1) < seq_len:
             # build mask for target and calculate output
-            decoder_mask = torch.triu(torch.ones((1, decoder_input.size(1), decoder_input.size(1))), diagonal=1).type(
-                torch.int).type_as(source_mask).to(device)
             out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)
             # project next token
             prob = model.project(out[:, -1])
             _, next_word = torch.max(prob, dim=1)
             decoder_input = torch.cat(
-                [decoder_input, torch.empty(1, 1).type_as(source).fill_(next_word.item()).to(device)], dim=1)
             # print the translated word
-            print(f"{tokenizer_tgt.decode([next_word.item()])}", end=' ')
             # break if we predict the end of sentence token
-            if next_word == tokenizer_tgt.token_to_id('[EOS]'):
                 break
         # convert ids to tokens
     return tokenizer_tgt.decode(decoder_input[0].tolist())
 # read sentence from argument
-translate(sys.argv[1] if len(sys.argv) > 1 else "I am not a very good student.")

 import torch
 import sys
 def translate(sentence: str):
     """Translate ``sentence`` with the latest saved checkpoint.

     Args:
         sentence: Text to translate. If it is an ``int`` or a string made only
             of digits, it is used as an index into the dataset and the
             source/target texts of that item are used instead.

     Returns:
         The decoded target-side token sequence produced by greedy decoding.

     Raises:
         ValueError: If the tokenized sentence does not fit in
             ``config["seq_len"]`` together with the [SOS]/[EOS] tokens.

     The translation is also printed token by token while it is generated.
     """
     # Define the device, tokenizers, and model
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     print("Using device:", device)
     config = get_config()
     tokenizer_src = Tokenizer.from_file(
         str(Path(config["tokenizer_file"].format(config["lang_src"])))
     )
     tokenizer_tgt = Tokenizer.from_file(
         str(Path(config["tokenizer_file"].format(config["lang_tgt"])))
     )
     model = build_transformer(
         tokenizer_src.get_vocab_size(),
         tokenizer_tgt.get_vocab_size(),
         config["seq_len"],
         config["seq_len"],
         d_model=config["d_model"],
     ).to(device)

     # Load the pretrained weights; map_location keeps GPU-saved checkpoints
     # loadable on CPU-only hosts.
     model_filename = latest_weights_file_path(config)
     state = torch.load(model_filename, map_location=device)
     model.load_state_dict(state["model_state_dict"])
     # Inference mode: disables dropout so decoding is deterministic.
     model.eval()

     # If the sentence is a number use it as an index to the test set
     label = ""
     sample_id = None
     if isinstance(sentence, int) or sentence.isdigit():
         sample_id = int(sentence)
         ds = load_dataset(
             f"{config['datasource']}",
             f"{config['lang_src']}-{config['lang_tgt']}",
             split="all",
         )
         ds = BilingualDataset(
             ds,
             tokenizer_src,
             tokenizer_tgt,
             config["lang_src"],
             config["lang_tgt"],
             config["seq_len"],
         )
         sentence = ds[sample_id]["src_text"]
         label = ds[sample_id]["tgt_text"]
     seq_len = config["seq_len"]

     source_ids = tokenizer_src.encode(sentence).ids
     num_padding = seq_len - len(source_ids) - 2  # room for [SOS] and [EOS]
     if num_padding < 0:
         # A negative count would silently produce an unpadded, over-long input.
         raise ValueError(
             f"Sentence has {len(source_ids)} tokens, which does not fit in "
             f"seq_len={seq_len} together with [SOS] and [EOS]"
         )

     sos_src = tokenizer_src.token_to_id("[SOS]")
     eos_src = tokenizer_src.token_to_id("[EOS]")
     pad_src = tokenizer_src.token_to_id("[PAD]")
     sos_tgt = tokenizer_tgt.token_to_id("[SOS]")
     eos_tgt = tokenizer_tgt.token_to_id("[EOS]")

     # translate the sentence
     with torch.no_grad():
         # Precompute the encoder output and reuse it for every generation step
         source = torch.tensor(
             [sos_src, *source_ids, eos_src, *([pad_src] * num_padding)],
             dtype=torch.int64,
         ).to(device)
         # 1 marks real tokens, 0 marks padding.
         source_mask = (source != pad_src).unsqueeze(0).unsqueeze(0).int().to(device)
         encoder_output = model.encode(source, source_mask)

         # Initialize the decoder input with the sos token
         decoder_input = torch.full(
             (1, 1), sos_tgt, dtype=source.dtype, device=device
         )

         # Print the source sentence and target start prompt
         if label != "":
             print(f"{f'ID: ':>12}{sample_id}")
         print(f"{f'SOURCE: ':>12}{sentence}")
         if label != "":
             print(f"{f'TARGET: ':>12}{label}")
         print(f"{f'PREDICTED: ':>12}", end="")

         # Generate the translation word by word
         while decoder_input.size(1) < seq_len:
             cur_len = decoder_input.size(1)
             # Causal mask using the same convention as source_mask (1 = may
             # attend): position i sees positions <= i. The previous
             # strictly-upper-triangular mask marked only *future* positions
             # with 1, which is the inverse of that convention.
             decoder_mask = torch.tril(
                 torch.ones(
                     (1, cur_len, cur_len), dtype=source_mask.dtype, device=device
                 )
             )
             out = model.decode(encoder_output, source_mask, decoder_input, decoder_mask)

             # project next token
             prob = model.project(out[:, -1])
             _, next_word = torch.max(prob, dim=1)
             next_id = next_word.item()
             decoder_input = torch.cat(
                 [
                     decoder_input,
                     torch.full((1, 1), next_id, dtype=source.dtype, device=device),
                 ],
                 dim=1,
             )

             # print the translated word
             print(f"{tokenizer_tgt.decode([next_id])}", end=" ")

             # break if we predict the end of sentence token
             if next_id == eos_tgt:
                 break

     # convert ids to tokens
     return tokenizer_tgt.decode(decoder_input[0].tolist())
 # Translate the first command-line argument, or a default example sentence.
 translate(sys.argv[1] if sys.argv[1:] else "I am not a very good student.")

utils.py ADDED Viewed

	@@ -0,0 +1,308 @@

+import torch
+import numpy as np
+from torch_geometric.data import Data, Batch
+from rdkit import Chem
+from rdkit.Chem import AllChem
+import nglview as nv
+import py3Dmol
+from dataset import get_atom_features, get_protein_features
+from model_attention import BindingAffinityModel
+DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+MODEL_PATH = "runs/experiment_attention20260124_104439_optuna/models/model_ep041_mse1.9153.pth"
+GAT_HEADS = 2
+HIDDEN_CHANNELS = 256
def get_inference_data(ligand_smiles, protein_sequence, model_path=MODEL_PATH):
    """
    Run the binding-affinity model on one ligand/protein pair.

    Args:
        ligand_smiles: SMILES string of the ligand.
        protein_sequence: Protein sequence; each character is converted to a
            token with ``get_protein_features`` and the token list is
            truncated / zero-padded to 1200 entries.
        model_path: Path of the saved model state dict.

    Returns:
        - mol: RDKit molecule object with 3D coordinates
        - importance: array of importance scores (one per row of the
          cross-attention map, i.e. per ligand atom according to the
          original documentation)
        - predicted_affinity: predicted binding affinity value

    Raises:
        ValueError: If the SMILES cannot be parsed, contains no atoms, or no
            3D conformer can be generated for it.
    """
    max_protein_len = 1200

    # Prepare ligand molecule with geometry RDKit
    mol = Chem.MolFromSmiles(ligand_smiles)
    if mol is None:
        # MolFromSmiles signals a parse failure by returning None.
        raise ValueError(f"Could not parse ligand SMILES: {ligand_smiles!r}")
    if mol.GetNumAtoms() == 0:
        raise ValueError("Ligand SMILES contains no atoms")
    mol = Chem.AddHs(mol)
    # EmbedMolecule returns -1 when embedding fails; retry once from random
    # coordinates before giving up, since callers rely on a 3D conformer.
    if AllChem.EmbedMolecule(mol, randomSeed=42) == -1:
        if AllChem.EmbedMolecule(mol, randomSeed=42, useRandomCoords=True) == -1:
            raise ValueError(
                f"Could not generate 3D coordinates for ligand: {ligand_smiles!r}"
            )

    # Graph data PyTorch
    atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
    x = torch.tensor(np.array(atom_features), dtype=torch.float)

    # Each bond contributes both directed edges.
    edge_pairs = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_pairs.extend([(i, j), (j, i)])
    # reshape(-1, 2) keeps the result (2, E) even when there are no bonds.
    edge_index = (
        torch.tensor(edge_pairs, dtype=torch.long).reshape(-1, 2).t().contiguous()
    )

    # Truncate or zero-pad the protein tokens to a fixed length.
    tokens = [get_protein_features(c) for c in protein_sequence][:max_protein_len]
    tokens.extend([0] * (max_protein_len - len(tokens)))
    protein_tensor = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(DEVICE)

    data = Data(x=x, edge_index=edge_index)
    batch = Batch.from_data_list([data]).to(DEVICE)
    num_features = x.shape[1]

    # Model loading
    model = BindingAffinityModel(
        num_features, hidden_channels=HIDDEN_CHANNELS, gat_heads=GAT_HEADS
    ).to(DEVICE)
    model.load_state_dict(torch.load(model_path, map_location=DEVICE))
    model.eval()

    # Prediction
    with torch.no_grad():
        pred = model(batch.x, batch.edge_index, batch.batch, protein_tensor)
        attention_weights = model.cross_attention.last_attention_weights[0]

    # Attention importance, Max + Normalize.
    # Token 0 is treated as padding, so only non-zero tokens count as real.
    real_prot_len = sum(1 for t in tokens if t != 0)
    if real_prot_len > 0:
        importance = (
            attention_weights[:, :real_prot_len].max(dim=1).values.cpu().numpy()
        )
    else:
        # No real protein tokens: a max over an empty slice would raise.
        importance = np.zeros(attention_weights.shape[0], dtype=np.float32)

    # Normalize to [0, 1]; skipped when all scores are equal to avoid 0/0.
    lowest, highest = importance.min(), importance.max()
    if highest > 0 and highest > lowest:
        importance = (importance - lowest) / (highest - lowest)

    # Noise reduction
    importance[importance < 0.01] = 0
    return mol, importance, pred.item()
def get_py3dmol_view(mol, importance):
    """Return a py3Dmol view of ``mol`` with its most important atoms labelled.

    The molecule is drawn as sticks and spheres on a white background. Up to
    15 atoms with the highest ``importance`` get a label of the form
    ``index:symbol:score``, placed at the atom position of the conformer.
    """
    viewer = py3Dmol.view(width="100%", height="600px")
    viewer.addModel(Chem.MolToMolBlock(mol), "sdf")
    viewer.setBackgroundColor('white')
    base_style = {'stick': {'radius': 0.15}, 'sphere': {'scale': 0.25}}
    viewer.setStyle({}, base_style)

    # Highest scores first, keep the top 15, then label in atom-index order.
    ranked = np.argsort(importance)[::-1]
    labelled_atoms = sorted(int(idx) for idx in ranked[:15])

    conformer = mol.GetConformer()
    for atom_idx in labelled_atoms:
        score = importance[atom_idx]
        point = conformer.GetAtomPosition(atom_idx)
        element = mol.GetAtomWithIdx(atom_idx).GetSymbol()
        label_options = {
            'position': {'x': point.x, 'y': point.y, 'z': point.z},
            'fontSize': 14,
            'fontColor': 'white',
            'backgroundColor': 'black',
            'backgroundOpacity': 0.7,
            'borderThickness': 0,
            'inFront': True,
            'showBackground': True
        }
        viewer.addLabel(f"{atom_idx}:{element}:{score:.2f}", label_options)

    viewer.zoomTo()
    return viewer
def save_standalone_ngl_html(mol, importance, filepath):
    """Write a self-contained NGL viewer page for ``mol`` to ``filepath``.

    Each atom's importance (multiplied by 100) is stored as its PDB
    temperature factor; the page colours atoms by that value when the
    "Show Heatmap" checkbox is on, labels up to 15 top-scoring atoms and shows
    a hover tooltip with the atom id and importance.
    """
    # Round-trip through a PDB block; presumably so that atoms carry the PDB
    # residue info that holds the temperature factor.
    mol_pdb = Chem.MolFromPDBBlock(Chem.MolToPDBBlock(mol), removeHs=False)
    for atom_idx, atom in enumerate(mol_pdb.GetAtoms()):
        residue_info = atom.GetPDBResidueInfo()
        if not residue_info:
            continue
        residue_info.SetTempFactor(float(importance[atom_idx]) * 100)

    # Backticks are escaped because the block is embedded in a JS template literal.
    final_pdb_block = Chem.MolToPDBBlock(mol_pdb).replace("`", "\\`")

    top_atoms = np.argsort(importance)[::-1][:15]
    selection_list = list(map(str, top_atoms))
    # Guard against an empty selection
    selection_str = "@" + ",".join(selection_list) if selection_list else "@-1"

    html_content = f"""<!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>NGL Visualization</title>
        <script src="https://unpkg.com/ngl@2.0.0-dev.37/dist/ngl.js"></script>
        <style>
            html, body {{ width: 100%; height: 100%; margin: 0; padding: 0; overflow: hidden; font-family: sans-serif; }}
            #viewport {{ width: 100%; height: 100%; }}
            /* Стиль подсказки */
            #tooltip {{
                display: none;
                position: absolute;
                z-index: 100;
                pointer-events: none; /* Чтобы мышь не 'застревала' на подсказке */
                background-color: rgba(20, 20, 20, 0.9);
                color: white;
                padding: 8px 12px;
                border-radius: 6px;
                font-size: 14px;
                box-shadow: 0 4px 6px rgba(0,0,0,0.3);
                white-space: nowrap;
                border: 1px solid rgba(255,255,255,0.2);
                transition: opacity 0.1s ease;
            }}
            /* Панель управления */
            #controls {{
                position: absolute;
                top: 20px;
                right: 20px;
                z-index: 50;
                background: rgba(255, 255, 255, 0.95);
                padding: 15px;
                border-radius: 8px;
                box-shadow: 0 2px 10px rgba(0,0,0,0.1);
                display: flex;
                align-items: center;
            }}
            /* Стили переключателя */
            .switch-container {{
                display: flex;
                align-items: center;
                gap: 10px;
                cursor: pointer;
                font-weight: bold;
                color: #333;
            }}
            input[type=checkbox] {{
                transform: scale(1.5);
                cursor: pointer;
            }}
        </style>
    </head>
    <body>
        <div id="controls">
            <label class="switch-container">
                <input type="checkbox" id="heatmapToggle" checked>
                <span>Show Heatmap</span>
            </label>
        </div>
        <div id="tooltip"></div>
        <div id="viewport"></div>
        <script>
            var pdbData = `{final_pdb_block}`;
            var selectionString = "{selection_str}";
            var component; // Глобальная переменная для доступа к модели
            document.addEventListener("DOMContentLoaded", function () {{
                var stage = new NGL.Stage("viewport", {{ backgroundColor: "white" }});
                var tooltip = document.getElementById("tooltip");
                var toggle = document.getElementById("heatmapToggle");
                // Загружаем данные
                var stringBlob = new Blob([pdbData], {{type: 'text/plain'}});
                stage.loadFile(stringBlob, {{ ext: 'pdb' }}).then(function (o) {{
                    component = o; // Сохраняем ссылку
                    // Рисуем начальное состояние
                    updateVisualization();
                    o.autoView();
                }});
                // --- ФУНКЦИЯ ОБНОВЛЕНИЯ ВИДА ---
                function updateVisualization() {{
                    if (!component) return;
                    // Очищаем старые представления (чтобы не накладывались)
                    component.removeAllRepresentations();
                    var useHeatmap = toggle.checked;
                    if (useHeatmap) {{
                        // 1. РЕЖИМ HEATMAP
                        component.addRepresentation("ball+stick", {{
                            colorScheme: "bfactor",
                            colorDomain: [20, 80],
                            colorScale: ["blue", "white", "red"],
                            radiusScale: 1.0
                        }});
                    }} else {{
                        // 2. ОБЫЧНЫЙ РЕЖИМ (По элементам)
                        component.addRepresentation("ball+stick", {{
                            colorScheme: "element",
                            radiusScale: 1.0
                        }});
                    }}
                    // Добавляем метки (они нужны всегда)
                    if (selectionString.length > 1 && selectionString !== "@-1") {{
                        component.addRepresentation("label", {{
                            sele: selectionString,
                            labelType: "atomindex",
                            color: "black",
                            radius: 1.1,
                            yOffset: 0.0,
                            zOffset: 2.0,
                            attachment: "middle_center",
                            pickable: true // ВАЖНО: Делаем текст интерактивным
                        }});
                    }}
                }}
                // Слушаем переключатель
                toggle.addEventListener("change", updateVisualization);
                // --- УМНЫЙ TOOLTIP ---
                stage.mouseControls.remove("hoverPick"); // Убираем стандартное поведение
                stage.signals.hovered.add(function (pickingProxy) {{
                    // Проверяем, навели ли мы на атом ИЛИ на метку (текст)
                    // NGL возвращает pickingProxy.atom даже если мы навели на label этого атома
                    if (pickingProxy && (pickingProxy.atom || pickingProxy.closestBondAtom)) {{
                        var atom = pickingProxy.atom || pickingProxy.closestBondAtom;
                        var score = atom.bfactor.toFixed(2);
                        tooltip.innerHTML = `
                            <div style="margin-bottom:2px;"><b>Atom ID:</b> ${{atom.index}} (${{atom.element}}:  ${{atom.atomname}})</div>
                            <div style="color: #ffcccc;"><b>Importance:</b> ${{(score/100).toFixed(3)}}</div>
                        `;
                        tooltip.style.display = "block";
                        tooltip.style.opacity = "1";
                        // Позиционирование: сдвиг вправо и вниз, чтобы не мешать
                        var cp = pickingProxy.canvasPosition;
                        tooltip.style.left = (cp.x + 20) + "px";
                        tooltip.style.top = (cp.y + 20) + "px";
                    }} else {{
                        // Скрываем, если увели мышь
                        tooltip.style.display = "none";
                        tooltip.style.opacity = "0";
                    }}
                }});
                // Ресайз окна
                window.addEventListener("resize", function(event){{
                    stage.handleResize();
                }}, false);
            }});
        </script>
    </body>
    </html>"""
    with open(filepath, "w", encoding="utf-8") as out_file:
        out_file.write(html_content)

visualization.ipynb CHANGED Viewed

@@ -2,261 +2,255 @@
  "cells": [
   {
    "cell_type": "code",
    "id": "initial_id",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.479196Z",
-     "start_time": "2025-12-05T14:02:00.003864Z"
     }
    },
-   "source": [
-    "import nglview as nv\n",
-    "import os"
-   ],
    "outputs": [
     {
      "data": {
-      "text/plain": [],
       "application/vnd.jupyter.widget-view+json": {
        "version_major": 2,
-       "version_minor": 0,
-       "model_id": "3016118bc02a458cbcb4491a27089a6a"
-      }
      },
      "metadata": {},
-     "output_type": "display_data",
-     "jetTransient": {
-      "display_id": null
-     }
     }
    ],
-   "execution_count": 1
   },
   {
    "cell_type": "code",
    "id": "d8d7978e-980a-400c-8c6a-5365990c8855",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.497753Z",
-     "start_time": "2025-12-05T14:02:00.493751Z"
     }
    },
    "source": [
     "PDBBIND_PATH = \"refined-set\""
-   ],
-   "outputs": [],
-   "execution_count": 2
   },
   {
    "cell_type": "code",
    "id": "788a6b43-c515-45c7-bc52-341d446b1a65",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.510747Z",
-     "start_time": "2025-12-05T14:02:00.505672Z"
     }
    },
    "source": [
     "EXAMPLE_PDB_ID = \"1a1e\""
-   ],
-   "outputs": [],
-   "execution_count": 3
   },
   {
    "cell_type": "code",
    "id": "e8f4bebc-845f-43e8-bc4d-ab7b649eb49c",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.523669Z",
-     "start_time": "2025-12-05T14:02:00.518519Z"
     }
    },
    "source": [
     "pdb_dir = os.path.join(PDBBIND_PATH, EXAMPLE_PDB_ID)"
-   ],
-   "outputs": [],
-   "execution_count": 4
   },
   {
    "cell_type": "code",
    "id": "24b5e435-4d8f-4505-b27c-dd6317376ed4",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.570497Z",
-     "start_time": "2025-12-05T14:02:00.565454Z"
     }
    },
    "source": [
     "protein_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_protein.pdb\")"
-   ],
-   "outputs": [],
-   "execution_count": 5
   },
   {
    "cell_type": "code",
    "id": "e7fc3539-00c0-48a2-b012-c80757fa12c4",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.584673Z",
-     "start_time": "2025-12-05T14:02:00.578982Z"
     }
    },
    "source": [
     "ligand_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_ligand.sdf\")"
-   ],
-   "outputs": [],
-   "execution_count": 6
   },
   {
    "cell_type": "code",
    "id": "9a053b99-7c01-4881-b3f7-e9b39090af9d",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.649631Z",
-     "start_time": "2025-12-05T14:02:00.591897Z"
     }
    },
    "source": [
     "view = nv.NGLWidget()"
-   ],
-   "outputs": [],
-   "execution_count": 7
   },
   {
    "cell_type": "code",
    "id": "df8c8e00-3ce6-41dd-b457-d9f50e318dad",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.779528Z",
-     "start_time": "2025-12-05T14:02:00.657448Z"
     }
    },
    "source": [
     "protein_comp = view.add_component(protein_file)"
-   ],
-   "outputs": [],
-   "execution_count": 8
   },
   {
    "cell_type": "code",
    "id": "c191fead-fef8-4077-b787-5bf9552307b1",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.802894Z",
-     "start_time": "2025-12-05T14:02:00.795534Z"
     }
    },
    "source": [
     "protein_comp.clear_representations()"
-   ],
-   "outputs": [],
-   "execution_count": 9
   },
   {
    "cell_type": "code",
    "id": "4559033a-aeda-4659-8d91-9002b5a6ecda",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.824161Z",
-     "start_time": "2025-12-05T14:02:00.817622Z"
     }
    },
-   "source": [
-    "protein_comp.add_representation('cartoon', color='blue')"
-   ],
    "outputs": [],
-   "execution_count": 10
   },
   {
    "cell_type": "code",
    "id": "73ea1a50-8463-40b8-a942-0c92d3e97a97",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.850013Z",
-     "start_time": "2025-12-05T14:02:00.840262Z"
     }
    },
    "source": [
     "ligand_comp = view.add_component(ligand_file)"
-   ],
-   "outputs": [],
-   "execution_count": 11
   },
   {
    "cell_type": "code",
    "id": "16cdb710-1ed6-4b1d-9e6a-69b7ad61a600",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.866184Z",
-     "start_time": "2025-12-05T14:02:00.859732Z"
     }
    },
    "source": [
     "ligand_comp.clear_representations()"
-   ],
-   "outputs": [],
-   "execution_count": 12
   },
   {
    "cell_type": "code",
    "id": "2193c497-f33c-4de0-86a9-6e535002fcb7",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.882846Z",
-     "start_time": "2025-12-05T14:02:00.876856Z"
     }
    },
-   "source": [
-    "ligand_comp.add_representation('ball+stick', radius=0.3)"
-   ],
    "outputs": [],
-   "execution_count": 13
   },
   {
    "cell_type": "code",
    "id": "b1cc7f44-a374-4400-b4ba-8f75101b21ce",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.903573Z",
-     "start_time": "2025-12-05T14:02:00.897038Z"
     }
    },
-   "source": [
-    "view"
-   ],
    "outputs": [
     {
      "data": {
-      "text/plain": [
-       "NGLWidget()"
-      ],
       "application/vnd.jupyter.widget-view+json": {
        "version_major": 2,
-       "version_minor": 0,
-       "model_id": "028b8398377e4869a80fba4c3d5e5921"
-      }
      },
      "metadata": {},
-     "output_type": "display_data",
-     "jetTransient": {
-      "display_id": null
-     }
     }
    ],
-   "execution_count": 14
   },
   {
    "cell_type": "code",
    "id": "5655e465-bb44-4218-a5e3-db2c5e62cd9c",
    "metadata": {
     "ExecuteTime": {
-     "end_time": "2025-12-05T14:02:00.915090Z",
-     "start_time": "2025-12-05T14:02:00.912563Z"
     }
    },
-   "source": [],
    "outputs": [],
-   "execution_count": null
   }
  ],
  "metadata": {

  "cells": [
   {
    "cell_type": "code",
+   "execution_count": 1,
    "id": "initial_id",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:36.981469Z",
+     "start_time": "2026-01-24T09:06:36.975634Z"
     }
    },
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "5077355be9e64d4f814509a151b6c8b6",
        "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": []
      },
      "metadata": {},
+     "output_type": "display_data"
     }
    ],
+   "source": [
+    "import nglview as nv\n",
+    "import os"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 2,
    "id": "d8d7978e-980a-400c-8c6a-5365990c8855",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.011231Z",
+     "start_time": "2026-01-24T09:06:37.005099Z"
     }
    },
+   "outputs": [],
    "source": [
     "PDBBIND_PATH = \"refined-set\""
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 3,
    "id": "788a6b43-c515-45c7-bc52-341d446b1a65",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.022991Z",
+     "start_time": "2026-01-24T09:06:37.016849Z"
     }
    },
+   "outputs": [],
    "source": [
     "EXAMPLE_PDB_ID = \"1a1e\""
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 4,
    "id": "e8f4bebc-845f-43e8-bc4d-ab7b649eb49c",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.041322Z",
+     "start_time": "2026-01-24T09:06:37.035944Z"
     }
    },
+   "outputs": [],
    "source": [
     "pdb_dir = os.path.join(PDBBIND_PATH, EXAMPLE_PDB_ID)"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 5,
    "id": "24b5e435-4d8f-4505-b27c-dd6317376ed4",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.064924Z",
+     "start_time": "2026-01-24T09:06:37.059278Z"
     }
    },
+   "outputs": [],
    "source": [
     "protein_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_protein.pdb\")"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 6,
    "id": "e7fc3539-00c0-48a2-b012-c80757fa12c4",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.080165Z",
+     "start_time": "2026-01-24T09:06:37.074657Z"
     }
    },
+   "outputs": [],
    "source": [
     "ligand_file = os.path.join(pdb_dir, f\"{EXAMPLE_PDB_ID}_ligand.sdf\")"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 7,
    "id": "9a053b99-7c01-4881-b3f7-e9b39090af9d",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.126934Z",
+     "start_time": "2026-01-24T09:06:37.107047Z"
     }
    },
+   "outputs": [],
    "source": [
     "view = nv.NGLWidget()"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 8,
    "id": "df8c8e00-3ce6-41dd-b457-d9f50e318dad",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.209871Z",
+     "start_time": "2026-01-24T09:06:37.140785Z"
     }
    },
+   "outputs": [],
    "source": [
     "protein_comp = view.add_component(protein_file)"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 9,
    "id": "c191fead-fef8-4077-b787-5bf9552307b1",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.243271Z",
+     "start_time": "2026-01-24T09:06:37.235380Z"
     }
    },
+   "outputs": [],
    "source": [
     "protein_comp.clear_representations()"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 10,
    "id": "4559033a-aeda-4659-8d91-9002b5a6ecda",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.276519Z",
+     "start_time": "2026-01-24T09:06:37.270030Z"
     }
    },
    "outputs": [],
+   "source": [
+    "protein_comp.add_representation(\"cartoon\", color=\"blue\")"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 11,
    "id": "73ea1a50-8463-40b8-a942-0c92d3e97a97",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.309460Z",
+     "start_time": "2026-01-24T09:06:37.299153Z"
     }
    },
+   "outputs": [],
    "source": [
     "ligand_comp = view.add_component(ligand_file)"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 12,
    "id": "16cdb710-1ed6-4b1d-9e6a-69b7ad61a600",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.340286Z",
+     "start_time": "2026-01-24T09:06:37.333802Z"
     }
    },
+   "outputs": [],
    "source": [
     "ligand_comp.clear_representations()"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 13,
    "id": "2193c497-f33c-4de0-86a9-6e535002fcb7",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.372239Z",
+     "start_time": "2026-01-24T09:06:37.365156Z"
     }
    },
    "outputs": [],
+   "source": [
+    "ligand_comp.add_representation(\"ball+stick\", radius=0.3)"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": 14,
    "id": "b1cc7f44-a374-4400-b4ba-8f75101b21ce",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.406445Z",
+     "start_time": "2026-01-24T09:06:37.398945Z"
     }
    },
    "outputs": [
     {
      "data": {
       "application/vnd.jupyter.widget-view+json": {
+       "model_id": "11e403e6733946b9b6942f47bff2464e",
        "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "NGLWidget()"
+      ]
      },
      "metadata": {},
+     "output_type": "display_data"
     }
    ],
+   "source": [
+    "view"
+   ]
   },
   {
    "cell_type": "code",
+   "execution_count": null,
    "id": "5655e465-bb44-4218-a5e3-db2c5e62cd9c",
    "metadata": {
     "ExecuteTime": {
+     "end_time": "2026-01-24T09:06:37.420258Z",
+     "start_time": "2026-01-24T09:06:37.416018Z"
     }
    },
    "outputs": [],
+   "source": []
   }
  ],
  "metadata": {