Commit Β·
13188b8
1
Parent(s): 543ad41
Prepared for deploy
Browse files- .dockerignore +6 -0
- .gitignore +3 -1
- Dockerfile +16 -0
- docker-compose.yml +8 -0
- main.py +37 -34
- model_attention.py +1 -1
- models/model_ep028_weighted_loss6.7715.pth +3 -0
- requirements.txt +17 -7
- EDA.ipynb β research/EDA.ipynb +0 -0
- {GNN_classification β research/GNN_classification}/Dataset_Preparation.py +0 -0
- {GNN_classification β research/GNN_classification}/dataset/classification/EDA.ipynb +0 -0
- {GNN_classification β research/GNN_classification}/dataset/classification/data_test.csv +0 -0
- {GNN_classification β research/GNN_classification}/dataset/classification/data_test.txt +0 -0
- {GNN_classification β research/GNN_classification}/dataset/classification/data_train.csv +0 -0
- {GNN_classification β research/GNN_classification}/dataset/classification/data_train.txt +0 -0
- {GNN_classification β research/GNN_classification}/model.py +0 -0
- {GNN_classification β research/GNN_classification}/training.py +0 -0
- GNNs__practice.ipynb β research/GNNs__practice.ipynb +0 -0
- all_inferences.py β research/all_inferences.py +82 -47
- research/dataset.py +209 -0
- dataset_preparation.py β research/dataset_preparation.py +0 -0
- inference.py β research/inference.py +0 -0
- inference_attention.py β research/inference_attention.py +0 -0
- research/loss.py +18 -0
- research/model.py +158 -0
- research/model_attention.py +144 -0
- model_pl.py β research/model_pl.py +0 -0
- optuna_train.py β research/optuna_train.py +0 -0
- optuna_train_attention.py β research/optuna_train_attention.py +5 -3
- pdbbind_refined_dataset.csv β research/pdbbind_refined_dataset.csv +0 -0
- research/requirements_dev.txt +18 -0
- train.py β research/train.py +0 -0
- train_attention.py β research/train_attention.py +23 -10
- train_pl.py β research/train_pl.py +0 -0
- visualization.ipynb β research/visualization.ipynb +0 -0
- utils.py +52 -36
.dockerignore
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.git
|
| 2 |
+
.idea
|
| 3 |
+
.venv
|
| 4 |
+
__pycache__
|
| 5 |
+
research
|
| 6 |
+
.env
|
.gitignore
CHANGED
|
@@ -1,7 +1,9 @@
|
|
| 1 |
.idea
|
| 2 |
.venv
|
| 3 |
.ipynb_checkpoints
|
|
|
|
| 4 |
|
| 5 |
/refined-set/
|
| 6 |
/data
|
| 7 |
-
/lightning_logs
|
|
|
|
|
|
| 1 |
.idea
|
| 2 |
.venv
|
| 3 |
.ipynb_checkpoints
|
| 4 |
+
__pycache__
|
| 5 |
|
| 6 |
/refined-set/
|
| 7 |
/data
|
| 8 |
+
/lightning_logs
|
| 9 |
+
.env
|
Dockerfile
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
FROM python:3.10-slim
|
| 2 |
+
|
| 3 |
+
WORKDIR /app
|
| 4 |
+
|
| 5 |
+
RUN apt-get update && apt-get install -y \
|
| 6 |
+
build-essential \
|
| 7 |
+
libxrender1 \
|
| 8 |
+
libxext6 \
|
| 9 |
+
&& rm -rf /var/lib/apt/lists/*
|
| 10 |
+
|
| 11 |
+
COPY requirements.txt .
|
| 12 |
+
RUN pip install --no-cache-dir -r requirements.txt
|
| 13 |
+
|
| 14 |
+
COPY . .
|
| 15 |
+
EXPOSE 7860
|
| 16 |
+
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "7860"]
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
services:
|
| 2 |
+
binding-predictor:
|
| 3 |
+
build: .
|
| 4 |
+
container_name: binding-app
|
| 5 |
+
ports:
|
| 6 |
+
- "8000:7860"
|
| 7 |
+
environment:
|
| 8 |
+
- GEMINI_API_KEY=${GEMINI_API_KEY}
|
main.py
CHANGED
|
@@ -5,8 +5,13 @@ from fastapi import FastAPI, Request, Form
|
|
| 5 |
from fastapi.templating import Jinja2Templates
|
| 6 |
from fastapi.staticfiles import StaticFiles
|
| 7 |
from fastapi.responses import HTMLResponse
|
| 8 |
-
from utils import
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
| 11 |
app = FastAPI()
|
| 12 |
|
|
@@ -17,6 +22,7 @@ app.mount("/static", StaticFiles(directory="static"), name="static")
|
|
| 17 |
|
| 18 |
templates = Jinja2Templates(directory="templates")
|
| 19 |
|
|
|
|
| 20 |
@app.get("/", response_class=HTMLResponse)
|
| 21 |
async def read_root(request: Request):
|
| 22 |
return templates.TemplateResponse("index.html", {"request": request})
|
|
@@ -24,33 +30,32 @@ async def read_root(request: Request):
|
|
| 24 |
|
| 25 |
@app.post("/predict", response_class=HTMLResponse)
|
| 26 |
async def predict(
|
| 27 |
-
request: Request,
|
| 28 |
-
smiles_ligand: str = Form(...),
|
| 29 |
-
sequence_protein: str = Form(...)
|
| 30 |
):
|
| 31 |
mol, importance, affinity = get_inference_data(smiles_ligand, sequence_protein)
|
| 32 |
|
| 33 |
atom_list = []
|
| 34 |
-
sorted_indices = sorted(
|
|
|
|
|
|
|
| 35 |
|
| 36 |
for idx in sorted_indices[:15]:
|
| 37 |
val = importance[idx]
|
| 38 |
symbol = mol.GetAtomWithIdx(idx).GetSymbol()
|
| 39 |
|
| 40 |
icon = ""
|
| 41 |
-
if val >= 0.9:
|
| 42 |
-
|
| 43 |
-
elif val >= 0.
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
"
|
| 47 |
-
|
| 48 |
-
"icon": icon
|
| 49 |
-
|
| 50 |
|
| 51 |
unique_id = str(uuid.uuid4())
|
| 52 |
|
| 53 |
-
|
| 54 |
filename_ngl = f"ngl_{unique_id}.html"
|
| 55 |
filepath_ngl = os.path.join("html_results", filename_ngl)
|
| 56 |
|
|
@@ -71,23 +76,21 @@ async def predict(
|
|
| 71 |
sequence_protein,
|
| 72 |
f"{affinity:.2f}",
|
| 73 |
atom_list,
|
| 74 |
-
lipinski_properties
|
| 75 |
)
|
| 76 |
|
| 77 |
-
return templates.TemplateResponse(
|
| 78 |
-
"
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
|
| 83 |
-
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
|
| 89 |
-
|
| 90 |
-
|
| 91 |
-
|
| 92 |
-
|
| 93 |
-
|
|
|
|
| 5 |
from fastapi.templating import Jinja2Templates
|
| 6 |
from fastapi.staticfiles import StaticFiles
|
| 7 |
from fastapi.responses import HTMLResponse
|
| 8 |
+
from utils import (
|
| 9 |
+
get_inference_data,
|
| 10 |
+
get_py3dmol_view,
|
| 11 |
+
save_standalone_ngl_html,
|
| 12 |
+
get_lipinski_properties,
|
| 13 |
+
get_gemini_explanation,
|
| 14 |
+
)
|
| 15 |
|
| 16 |
app = FastAPI()
|
| 17 |
|
|
|
|
| 22 |
|
| 23 |
templates = Jinja2Templates(directory="templates")
|
| 24 |
|
| 25 |
+
|
| 26 |
@app.get("/", response_class=HTMLResponse)
|
| 27 |
async def read_root(request: Request):
|
| 28 |
return templates.TemplateResponse("index.html", {"request": request})
|
|
|
|
| 30 |
|
| 31 |
@app.post("/predict", response_class=HTMLResponse)
|
| 32 |
async def predict(
|
| 33 |
+
request: Request, smiles_ligand: str = Form(...), sequence_protein: str = Form(...)
|
|
|
|
|
|
|
| 34 |
):
|
| 35 |
mol, importance, affinity = get_inference_data(smiles_ligand, sequence_protein)
|
| 36 |
|
| 37 |
atom_list = []
|
| 38 |
+
sorted_indices = sorted(
|
| 39 |
+
range(len(importance)), key=lambda k: importance[k], reverse=True
|
| 40 |
+
)
|
| 41 |
|
| 42 |
for idx in sorted_indices[:15]:
|
| 43 |
val = importance[idx]
|
| 44 |
symbol = mol.GetAtomWithIdx(idx).GetSymbol()
|
| 45 |
|
| 46 |
icon = ""
|
| 47 |
+
if val >= 0.9:
|
| 48 |
+
icon = "π₯"
|
| 49 |
+
elif val >= 0.7:
|
| 50 |
+
icon = "β¨"
|
| 51 |
+
elif val >= 0.5:
|
| 52 |
+
icon = "β"
|
| 53 |
+
atom_list.append(
|
| 54 |
+
{"id": idx, "symbol": symbol, "score": f"{val:.3f}", "icon": icon}
|
| 55 |
+
)
|
| 56 |
|
| 57 |
unique_id = str(uuid.uuid4())
|
| 58 |
|
|
|
|
| 59 |
filename_ngl = f"ngl_{unique_id}.html"
|
| 60 |
filepath_ngl = os.path.join("html_results", filename_ngl)
|
| 61 |
|
|
|
|
| 76 |
sequence_protein,
|
| 77 |
f"{affinity:.2f}",
|
| 78 |
atom_list,
|
| 79 |
+
lipinski_properties,
|
| 80 |
)
|
| 81 |
|
| 82 |
+
return templates.TemplateResponse(
|
| 83 |
+
"index.html",
|
| 84 |
+
{
|
| 85 |
+
"request": request,
|
| 86 |
+
"result_ready": True,
|
| 87 |
+
"smiles": smiles_ligand,
|
| 88 |
+
"protein": sequence_protein,
|
| 89 |
+
"affinity": f"{affinity:.2f}",
|
| 90 |
+
"atom_list": atom_list,
|
| 91 |
+
"html_py3dmol": py3dmol_content,
|
| 92 |
+
"url_ngl": ngl_url_link,
|
| 93 |
+
"lipinski": lipinski_properties,
|
| 94 |
+
"ai_explanation": ai_explanation,
|
| 95 |
+
},
|
| 96 |
+
)
|
|
|
|
|
|
model_attention.py
CHANGED
|
@@ -20,7 +20,7 @@ class CrossAttentionLayer(nn.Module):
|
|
| 20 |
# Feedforward network for further processing, classical transformer style
|
| 21 |
self.ff = nn.Sequential(
|
| 22 |
nn.Linear(feature_dim, feature_dim * 4),
|
| 23 |
-
nn.GELU(),
|
| 24 |
nn.Dropout(dropout),
|
| 25 |
nn.Linear(feature_dim * 4, feature_dim),
|
| 26 |
)
|
|
|
|
| 20 |
# Feedforward network for further processing, classical transformer style
|
| 21 |
self.ff = nn.Sequential(
|
| 22 |
nn.Linear(feature_dim, feature_dim * 4),
|
| 23 |
+
nn.GELU(), # GELU works better with transformers
|
| 24 |
nn.Dropout(dropout),
|
| 25 |
nn.Linear(feature_dim * 4, feature_dim),
|
| 26 |
)
|
models/model_ep028_weighted_loss6.7715.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d64adee176c82baa3489ffbae33567c31f5cea7c7b254e782651507754e3dfc2
|
| 3 |
+
size 1810686
|
requirements.txt
CHANGED
|
@@ -1,10 +1,20 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
| 4 |
|
| 5 |
-
|
| 6 |
-
|
| 7 |
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
|
|
|
|
| 1 |
+
fastapi==0.128.0
|
| 2 |
+
uvicorn[standard]==0.38.0
|
| 3 |
+
python-multipart==0.0.21
|
| 4 |
+
requests==2.32.5
|
| 5 |
|
| 6 |
+
python-decouple==3.8
|
| 7 |
+
py3Dmol==2.5.4
|
| 8 |
|
| 9 |
+
numpy==2.2.6
|
| 10 |
+
pandas==2.3.3
|
| 11 |
+
|
| 12 |
+
rdkit==2025.9.1
|
| 13 |
+
|
| 14 |
+
google-genai==1.53.0
|
| 15 |
+
|
| 16 |
+
--extra-index-url https://download.pytorch.org/whl/cpu
|
| 17 |
+
|
| 18 |
+
torch>=2.5.0
|
| 19 |
+
torch-geometric>=2.7.0
|
| 20 |
|
EDA.ipynb β research/EDA.ipynb
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/Dataset_Preparation.py
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/dataset/classification/EDA.ipynb
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/dataset/classification/data_test.csv
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/dataset/classification/data_test.txt
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/dataset/classification/data_train.csv
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/dataset/classification/data_train.txt
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/model.py
RENAMED
|
File without changes
|
{GNN_classification β research/GNN_classification}/training.py
RENAMED
|
File without changes
|
GNNs__practice.ipynb β research/GNNs__practice.ipynb
RENAMED
|
File without changes
|
all_inferences.py β research/all_inferences.py
RENAMED
|
@@ -19,11 +19,14 @@ from model_attention import BindingAffinityModel
|
|
| 19 |
|
| 20 |
|
| 21 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 22 |
-
MODEL_PATH =
|
|
|
|
|
|
|
| 23 |
|
| 24 |
GAT_HEADS = 2
|
| 25 |
HIDDEN_CHANNELS = 256
|
| 26 |
|
|
|
|
| 27 |
def get_inference_data(ligand_smiles, protein_sequence, model_path):
|
| 28 |
"""
|
| 29 |
Returns:
|
|
@@ -46,8 +49,10 @@ def get_inference_data(ligand_smiles, protein_sequence, model_path):
|
|
| 46 |
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
|
| 47 |
|
| 48 |
tokens = [get_protein_features(c) for c in protein_sequence]
|
| 49 |
-
if len(tokens) > 1200:
|
| 50 |
-
|
|
|
|
|
|
|
| 51 |
protein_sequence = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(DEVICE)
|
| 52 |
|
| 53 |
data = Data(x=x, edge_index=edge_index)
|
|
@@ -55,7 +60,9 @@ def get_inference_data(ligand_smiles, protein_sequence, model_path):
|
|
| 55 |
num_features = x.shape[1]
|
| 56 |
|
| 57 |
# Model loading
|
| 58 |
-
model = BindingAffinityModel(
|
|
|
|
|
|
|
| 59 |
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
|
| 60 |
model.eval()
|
| 61 |
|
|
@@ -70,7 +77,9 @@ def get_inference_data(ligand_smiles, protein_sequence, model_path):
|
|
| 70 |
|
| 71 |
# Normalize to [0, 1]
|
| 72 |
if importance.max() > 0:
|
| 73 |
-
importance = (importance - importance.min()) / (
|
|
|
|
|
|
|
| 74 |
|
| 75 |
# Noise reduction
|
| 76 |
importance[importance < 0.01] = 0
|
|
@@ -93,20 +102,22 @@ def print_atom_scores(mol, importance):
|
|
| 93 |
print(f"Atom {idx} ({symbol}): Importance = {score:.3f} {fire}")
|
| 94 |
|
| 95 |
|
| 96 |
-
|
| 97 |
def get_py3dmol(mol, importance, score):
|
| 98 |
|
| 99 |
view = py3Dmol.view(width=1000, height=800)
|
| 100 |
view.addModel(Chem.MolToMolBlock(mol), "sdf")
|
| 101 |
-
view.setBackgroundColor(
|
| 102 |
|
| 103 |
# 1. ΠΠΠΠΠΠ«Π Π‘Π’ΠΠΠ¬ (ΠΠ Π£ΠΠ’ΠΠΠΠ)
|
| 104 |
# ΠΠ°Π΄Π°Π΅ΠΌ Π΅Π΄ΠΈΠ½ΡΠΉ ΡΠ°Π·ΠΌΠ΅Ρ Π΄Π»Ρ Π²ΡΠ΅ΠΉ ΠΌΠΎΠ»Π΅ΠΊΡΠ»Ρ ΡΡΠ°Π·Ρ
|
| 105 |
# scale: 0.25 β ΠΎΠΏΡΠΈΠΌΠ°Π»ΡΠ½ΡΠΉ ΡΡΠ΅Π΄Π½ΠΈΠΉ ΡΠ°Π·ΠΌΠ΅Ρ
|
| 106 |
-
view.setStyle(
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
| 110 |
|
| 111 |
red_atoms = []
|
| 112 |
orange_atoms = []
|
|
@@ -130,48 +141,65 @@ def get_py3dmol(mol, importance, score):
|
|
| 130 |
if i in top_indices and val > 0.1:
|
| 131 |
pos = conf.GetAtomPosition(i)
|
| 132 |
symbol = mol.GetAtomWithIdx(i).GetSymbol()
|
| 133 |
-
labels_to_add.append(
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
|
|
|
|
|
|
| 137 |
|
| 138 |
# 3. ΠΠ ΠΠΠΠΠΠΠΠ Π‘Π’ΠΠΠΠ
|
| 139 |
# ΠΠ±ΡΠ°ΡΠΈ Π²Π½ΠΈΠΌΠ°Π½ΠΈΠ΅: scale Π²Π΅Π·Π΄Π΅ 0.25 (ΠΈΠ»ΠΈ 0.28, ΡΡΠΎΠ±Ρ ΡΡΡΡ Π²ΡΠ΄Π΅Π»ΠΈΡΡ ΡΠ²Π΅ΡΠ½ΡΠ΅)
|
| 140 |
# ΠΡ ΠΌΠ΅Π½ΡΠ΅ΠΌ Π’ΠΠΠ¬ΠΠ Π¦ΠΠΠ’.
|
| 141 |
|
| 142 |
if red_atoms:
|
| 143 |
-
view.addStyle(
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
|
|
|
|
|
|
|
|
|
| 147 |
|
| 148 |
if orange_atoms:
|
| 149 |
-
view.addStyle(
|
| 150 |
-
|
| 151 |
-
|
| 152 |
-
|
|
|
|
|
|
|
|
|
|
| 153 |
|
| 154 |
if blue_atoms:
|
| 155 |
-
view.addStyle(
|
| 156 |
-
|
| 157 |
-
|
| 158 |
|
| 159 |
# 4. ΠΠΠ’ΠΠ
|
| 160 |
for label in labels_to_add:
|
| 161 |
-
view.addLabel(
|
| 162 |
-
|
| 163 |
-
|
| 164 |
-
|
| 165 |
-
|
| 166 |
-
|
| 167 |
-
|
| 168 |
-
|
| 169 |
-
|
| 170 |
-
|
|
|
|
|
|
|
|
|
|
| 171 |
|
| 172 |
view.zoomTo()
|
| 173 |
-
view.addLabel(
|
| 174 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
|
| 176 |
return view
|
| 177 |
|
|
@@ -190,23 +218,31 @@ def get_ngl(mol, importance):
|
|
| 190 |
view = nv.NGLWidget(structure)
|
| 191 |
view.clear_representations()
|
| 192 |
|
| 193 |
-
view.add_representation(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
indices_sorted = np.argsort(importance)[::-1]
|
| 196 |
top_indices = indices_sorted[:15]
|
| 197 |
|
| 198 |
selection_str = "@" + ",".join(map(str, top_indices))
|
| 199 |
-
view.add_representation(
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
|
|
|
| 206 |
|
| 207 |
view.center()
|
| 208 |
return view
|
| 209 |
|
|
|
|
| 210 |
if __name__ == "__main__":
|
| 211 |
smiles = "COc1ccc(S(=O)(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc2ccccc2)NC(=O)O[C@@H]2C[C@@H]3NC(=O)O[C@@H]3C2)cc1"
|
| 212 |
protein = "PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKMIGGIGGFIKVRQYDQIIIEIAGHKAIGTVLVGPTPVNIIGRNLLTQIGATLNF"
|
|
@@ -222,4 +258,3 @@ if __name__ == "__main__":
|
|
| 222 |
|
| 223 |
ngl_widget = get_ngl(mol, importance)
|
| 224 |
nv.write_html(file_name_ngl, ngl_widget)
|
| 225 |
-
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 22 |
+
MODEL_PATH = (
|
| 23 |
+
"runs/experiment_attention20260124_104439_optuna/models/model_ep041_mse1.9153.pth"
|
| 24 |
+
)
|
| 25 |
|
| 26 |
GAT_HEADS = 2
|
| 27 |
HIDDEN_CHANNELS = 256
|
| 28 |
|
| 29 |
+
|
| 30 |
def get_inference_data(ligand_smiles, protein_sequence, model_path):
|
| 31 |
"""
|
| 32 |
Returns:
|
|
|
|
| 49 |
edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
|
| 50 |
|
| 51 |
tokens = [get_protein_features(c) for c in protein_sequence]
|
| 52 |
+
if len(tokens) > 1200:
|
| 53 |
+
tokens = tokens[:1200]
|
| 54 |
+
else:
|
| 55 |
+
tokens.extend([0] * (1200 - len(tokens)))
|
| 56 |
protein_sequence = torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(DEVICE)
|
| 57 |
|
| 58 |
data = Data(x=x, edge_index=edge_index)
|
|
|
|
| 60 |
num_features = x.shape[1]
|
| 61 |
|
| 62 |
# Model loading
|
| 63 |
+
model = BindingAffinityModel(
|
| 64 |
+
num_features, hidden_channels=HIDDEN_CHANNELS, gat_heads=GAT_HEADS
|
| 65 |
+
).to(DEVICE)
|
| 66 |
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
|
| 67 |
model.eval()
|
| 68 |
|
|
|
|
| 77 |
|
| 78 |
# Normalize to [0, 1]
|
| 79 |
if importance.max() > 0:
|
| 80 |
+
importance = (importance - importance.min()) / (
|
| 81 |
+
importance.max() - importance.min()
|
| 82 |
+
)
|
| 83 |
|
| 84 |
# Noise reduction
|
| 85 |
importance[importance < 0.01] = 0
|
|
|
|
| 102 |
print(f"Atom {idx} ({symbol}): Importance = {score:.3f} {fire}")
|
| 103 |
|
| 104 |
|
|
|
|
| 105 |
def get_py3dmol(mol, importance, score):
|
| 106 |
|
| 107 |
view = py3Dmol.view(width=1000, height=800)
|
| 108 |
view.addModel(Chem.MolToMolBlock(mol), "sdf")
|
| 109 |
+
view.setBackgroundColor("white")
|
| 110 |
|
| 111 |
# 1. ΠΠΠΠΠΠ«Π Π‘Π’ΠΠΠ¬ (ΠΠ Π£ΠΠ’ΠΠΠΠ)
|
| 112 |
# ΠΠ°Π΄Π°Π΅ΠΌ Π΅Π΄ΠΈΠ½ΡΠΉ ΡΠ°Π·ΠΌΠ΅Ρ Π΄Π»Ρ Π²ΡΠ΅ΠΉ ΠΌΠΎΠ»Π΅ΠΊΡΠ»Ρ ΡΡΠ°Π·Ρ
|
| 113 |
# scale: 0.25 β ΠΎΠΏΡΠΈΠΌΠ°Π»ΡΠ½ΡΠΉ ΡΡΠ΅Π΄Π½ΠΈΠΉ ΡΠ°Π·ΠΌΠ΅Ρ
|
| 114 |
+
view.setStyle(
|
| 115 |
+
{},
|
| 116 |
+
{
|
| 117 |
+
"stick": {"color": "#cccccc", "radius": 0.1},
|
| 118 |
+
"sphere": {"color": "#cccccc", "scale": 0.25},
|
| 119 |
+
},
|
| 120 |
+
)
|
| 121 |
|
| 122 |
red_atoms = []
|
| 123 |
orange_atoms = []
|
|
|
|
| 141 |
if i in top_indices and val > 0.1:
|
| 142 |
pos = conf.GetAtomPosition(i)
|
| 143 |
symbol = mol.GetAtomWithIdx(i).GetSymbol()
|
| 144 |
+
labels_to_add.append(
|
| 145 |
+
{
|
| 146 |
+
"text": f"{i}:{symbol}:{val:.2f}",
|
| 147 |
+
"pos": {"x": pos.x, "y": pos.y, "z": pos.z},
|
| 148 |
+
}
|
| 149 |
+
)
|
| 150 |
|
| 151 |
# 3. ΠΠ ΠΠΠΠΠΠΠΠ Π‘Π’ΠΠΠΠ
|
| 152 |
# ΠΠ±ΡΠ°ΡΠΈ Π²Π½ΠΈΠΌΠ°Π½ΠΈΠ΅: scale Π²Π΅Π·Π΄Π΅ 0.25 (ΠΈΠ»ΠΈ 0.28, ΡΡΠΎΠ±Ρ ΡΡΡΡ Π²ΡΠ΄Π΅Π»ΠΈΡΡ ΡΠ²Π΅ΡΠ½ΡΠ΅)
|
| 153 |
# ΠΡ ΠΌΠ΅Π½ΡΠ΅ΠΌ Π’ΠΠΠ¬ΠΠ Π¦ΠΠΠ’.
|
| 154 |
|
| 155 |
if red_atoms:
|
| 156 |
+
view.addStyle(
|
| 157 |
+
{"serial": red_atoms},
|
| 158 |
+
{
|
| 159 |
+
"sphere": {"color": "#FF0000", "scale": 0.28},
|
| 160 |
+
"stick": {"color": "#FF0000", "radius": 0.12},
|
| 161 |
+
},
|
| 162 |
+
)
|
| 163 |
|
| 164 |
if orange_atoms:
|
| 165 |
+
view.addStyle(
|
| 166 |
+
{"serial": orange_atoms},
|
| 167 |
+
{
|
| 168 |
+
"sphere": {"color": "#FF8C00", "scale": 0.28},
|
| 169 |
+
"stick": {"color": "#FF8C00", "radius": 0.12},
|
| 170 |
+
},
|
| 171 |
+
)
|
| 172 |
|
| 173 |
if blue_atoms:
|
| 174 |
+
view.addStyle(
|
| 175 |
+
{"serial": blue_atoms}, {"sphere": {"color": "#7777FF", "scale": 0.28}}
|
| 176 |
+
)
|
| 177 |
|
| 178 |
# 4. ΠΠΠ’ΠΠ
|
| 179 |
for label in labels_to_add:
|
| 180 |
+
view.addLabel(
|
| 181 |
+
label["text"],
|
| 182 |
+
{
|
| 183 |
+
"position": label["pos"],
|
| 184 |
+
"fontSize": 14,
|
| 185 |
+
"fontColor": "white",
|
| 186 |
+
"backgroundColor": "black",
|
| 187 |
+
"backgroundOpacity": 0.7,
|
| 188 |
+
"borderThickness": 0,
|
| 189 |
+
"inFront": True,
|
| 190 |
+
"showBackground": True,
|
| 191 |
+
},
|
| 192 |
+
)
|
| 193 |
|
| 194 |
view.zoomTo()
|
| 195 |
+
view.addLabel(
|
| 196 |
+
f"Predicted pKd: {float(score):.2f}",
|
| 197 |
+
{
|
| 198 |
+
"position": {"x": -5, "y": 10, "z": 0},
|
| 199 |
+
"backgroundColor": "black",
|
| 200 |
+
"fontColor": "white",
|
| 201 |
+
},
|
| 202 |
+
)
|
| 203 |
|
| 204 |
return view
|
| 205 |
|
|
|
|
| 218 |
view = nv.NGLWidget(structure)
|
| 219 |
view.clear_representations()
|
| 220 |
|
| 221 |
+
view.add_representation(
|
| 222 |
+
"ball+stick",
|
| 223 |
+
colorScheme="bfactor",
|
| 224 |
+
colorScale=["blue", "white", "red"],
|
| 225 |
+
colorDomain=[10, 80],
|
| 226 |
+
radiusScale=1.0,
|
| 227 |
+
)
|
| 228 |
|
| 229 |
indices_sorted = np.argsort(importance)[::-1]
|
| 230 |
top_indices = indices_sorted[:15]
|
| 231 |
|
| 232 |
selection_str = "@" + ",".join(map(str, top_indices))
|
| 233 |
+
view.add_representation(
|
| 234 |
+
"label",
|
| 235 |
+
selection=selection_str, # ΠΠΎΠ΄ΠΏΠΈΡΡΠ²Π°Π΅ΠΌ ΡΠΎΠ»ΡΠΊΠΎ ΠΈΠ·Π±ΡΠ°Π½Π½ΡΡ
|
| 236 |
+
labelType="atomindex", # ΠΠΎΠΊΠ°Π·ΡΠ²Π°ΡΡ ΠΠ½Π΄Π΅ΠΊΡ (0, 1, 2...)
|
| 237 |
+
color="black", # Π§Π΅ΡΠ½ΡΠΉ ΡΠ΅ΠΊΡΡ
|
| 238 |
+
radius=2.0, # Π Π°Π·ΠΌΠ΅Ρ ΡΡΠΈΡΡΠ° (ΠΏΠΎΠΏΡΠΎΠ±ΡΠΉΡΠ΅ 1.5 - 3.0)
|
| 239 |
+
zOffset=1.0,
|
| 240 |
+
) # Π§ΡΡΡ ΡΠ΄Π²ΠΈΠ½ΡΡΡ ΠΊ ΠΊΠ°ΠΌΠ΅ΡΠ΅
|
| 241 |
|
| 242 |
view.center()
|
| 243 |
return view
|
| 244 |
|
| 245 |
+
|
| 246 |
if __name__ == "__main__":
|
| 247 |
smiles = "COc1ccc(S(=O)(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc2ccccc2)NC(=O)O[C@@H]2C[C@@H]3NC(=O)O[C@@H]3C2)cc1"
|
| 248 |
protein = "PQITLWKRPLVTIKIGGQLKEALLDTGADDTVIEEMSLPGRWKPKMIGGIGGFIKVRQYDQIIIEIAGHKAIGTVLVGPTPVNIIGRNLLTQIGATLNF"
|
|
|
|
| 258 |
|
| 259 |
ngl_widget = get_ngl(mol, importance)
|
| 260 |
nv.write_html(file_name_ngl, ngl_widget)
|
|
|
research/dataset.py
ADDED
|
@@ -0,0 +1,209 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
import torch
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from rdkit import Chem, rdBase
|
| 5 |
+
from torch_geometric.data import Data
|
| 6 |
+
from torch.utils.data import Dataset, random_split
|
| 7 |
+
|
| 8 |
+
rdBase.DisableLog("rdApp.*")
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def one_of_k_encoding(x, allowable_set):
|
| 12 |
+
# last position - unknown
|
| 13 |
+
if x not in allowable_set:
|
| 14 |
+
x = allowable_set[-1]
|
| 15 |
+
return list(map(lambda s: x == s, allowable_set))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def get_atom_features(atom):
|
| 19 |
+
symbols_list = [
|
| 20 |
+
"C",
|
| 21 |
+
"N",
|
| 22 |
+
"O",
|
| 23 |
+
"S",
|
| 24 |
+
"F",
|
| 25 |
+
"Si",
|
| 26 |
+
"P",
|
| 27 |
+
"Cl",
|
| 28 |
+
"Br",
|
| 29 |
+
"Mg",
|
| 30 |
+
"Na",
|
| 31 |
+
"Ca",
|
| 32 |
+
"Fe",
|
| 33 |
+
"As",
|
| 34 |
+
"Al",
|
| 35 |
+
"I",
|
| 36 |
+
"B",
|
| 37 |
+
"V",
|
| 38 |
+
"K",
|
| 39 |
+
"Tl",
|
| 40 |
+
"Yb",
|
| 41 |
+
"Sb",
|
| 42 |
+
"Sn",
|
| 43 |
+
"Ag",
|
| 44 |
+
"Pd",
|
| 45 |
+
"Co",
|
| 46 |
+
"Se",
|
| 47 |
+
"Ti",
|
| 48 |
+
"Zn",
|
| 49 |
+
"H",
|
| 50 |
+
"Li",
|
| 51 |
+
"Ge",
|
| 52 |
+
"Cu",
|
| 53 |
+
"Au",
|
| 54 |
+
"Ni",
|
| 55 |
+
"Cd",
|
| 56 |
+
"In",
|
| 57 |
+
"Mn",
|
| 58 |
+
"Zr",
|
| 59 |
+
"Cr",
|
| 60 |
+
"Pt",
|
| 61 |
+
"Hg",
|
| 62 |
+
"Pb",
|
| 63 |
+
"Unknown",
|
| 64 |
+
]
|
| 65 |
+
degrees_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
| 66 |
+
numhs_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
| 67 |
+
implicit_valences_list = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
|
| 68 |
+
|
| 69 |
+
formal_charge_list = [-2, -1, 0, 1, 2]
|
| 70 |
+
chirality_list = [
|
| 71 |
+
Chem.rdchem.ChiralType.CHI_UNSPECIFIED,
|
| 72 |
+
Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CW,
|
| 73 |
+
Chem.rdchem.ChiralType.CHI_TETRAHEDRAL_CCW,
|
| 74 |
+
Chem.rdchem.ChiralType.CHI_OTHER,
|
| 75 |
+
]
|
| 76 |
+
return np.array(
|
| 77 |
+
# Type of atom (Symbol)
|
| 78 |
+
one_of_k_encoding(atom.GetSymbol(), symbols_list)
|
| 79 |
+
+
|
| 80 |
+
# Number of neighbours (Degree)
|
| 81 |
+
one_of_k_encoding(atom.GetDegree(), degrees_list)
|
| 82 |
+
+
|
| 83 |
+
# Number of hydrogen atoms (Implicit Hs) - bond donors
|
| 84 |
+
one_of_k_encoding(atom.GetTotalNumHs(), numhs_list)
|
| 85 |
+
+
|
| 86 |
+
# Valence - chemical potential
|
| 87 |
+
one_of_k_encoding(atom.GetImplicitValence(), implicit_valences_list)
|
| 88 |
+
+
|
| 89 |
+
# Hybridization - so important for 3d structure, sp2 - Trigonal planar, sp3 - Tetrahedral
|
| 90 |
+
one_of_k_encoding(
|
| 91 |
+
atom.GetHybridization(),
|
| 92 |
+
[
|
| 93 |
+
Chem.rdchem.HybridizationType.SP,
|
| 94 |
+
Chem.rdchem.HybridizationType.SP2,
|
| 95 |
+
Chem.rdchem.HybridizationType.SP3,
|
| 96 |
+
Chem.rdchem.HybridizationType.SP3D,
|
| 97 |
+
Chem.rdchem.HybridizationType.SP3D2,
|
| 98 |
+
"other",
|
| 99 |
+
],
|
| 100 |
+
)
|
| 101 |
+
+
|
| 102 |
+
# Aromaticity (Boolean)
|
| 103 |
+
[atom.GetIsAromatic()]
|
| 104 |
+
+
|
| 105 |
+
# Formal Charge
|
| 106 |
+
one_of_k_encoding(atom.GetFormalCharge(), formal_charge_list)
|
| 107 |
+
+
|
| 108 |
+
# Chirality (Geometry)
|
| 109 |
+
one_of_k_encoding(atom.GetChiralTag(), chirality_list)
|
| 110 |
+
+
|
| 111 |
+
# Is in ring (Boolean)
|
| 112 |
+
[atom.IsInRing()]
|
| 113 |
+
)
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
def get_protein_features(char):
|
| 117 |
+
prot_vocab = {
|
| 118 |
+
"A": 1,
|
| 119 |
+
"R": 2,
|
| 120 |
+
"N": 3,
|
| 121 |
+
"D": 4,
|
| 122 |
+
"C": 5,
|
| 123 |
+
"Q": 6,
|
| 124 |
+
"E": 7,
|
| 125 |
+
"G": 8,
|
| 126 |
+
"H": 9,
|
| 127 |
+
"I": 10,
|
| 128 |
+
"L": 11,
|
| 129 |
+
"K": 12,
|
| 130 |
+
"M": 13,
|
| 131 |
+
"F": 14,
|
| 132 |
+
"P": 15,
|
| 133 |
+
"S": 16,
|
| 134 |
+
"T": 17,
|
| 135 |
+
"W": 18,
|
| 136 |
+
"Y": 19,
|
| 137 |
+
"V": 20,
|
| 138 |
+
"X": 21,
|
| 139 |
+
"Z": 21,
|
| 140 |
+
"B": 21,
|
| 141 |
+
"PAD": 0,
|
| 142 |
+
"UNK": 21,
|
| 143 |
+
}
|
| 144 |
+
return prot_vocab.get(char, prot_vocab["UNK"])
|
| 145 |
+
|
| 146 |
+
|
| 147 |
+
class BindingDataset(Dataset):
|
| 148 |
+
def __init__(self, dataframe, max_seq_length=1000):
|
| 149 |
+
self.data = dataframe
|
| 150 |
+
self.max_seq_length = (
|
| 151 |
+
max_seq_length # Define a maximum sequence length for padding/truncation
|
| 152 |
+
)
|
| 153 |
+
|
| 154 |
+
def __len__(self):
|
| 155 |
+
return len(self.data)
|
| 156 |
+
|
| 157 |
+
def __getitem__(self, idx):
|
| 158 |
+
row = self.data.iloc[idx]
|
| 159 |
+
smiles = row["smiles"]
|
| 160 |
+
sequence = row["sequence"]
|
| 161 |
+
affinity = row["affinity"]
|
| 162 |
+
|
| 163 |
+
mol = Chem.MolFromSmiles(smiles)
|
| 164 |
+
if mol is None:
|
| 165 |
+
return None
|
| 166 |
+
|
| 167 |
+
# Ligand (Graph)
|
| 168 |
+
# Nodes
|
| 169 |
+
atom_features = [get_atom_features(atom) for atom in mol.GetAtoms()]
|
| 170 |
+
x = torch.tensor(np.array(atom_features), dtype=torch.float)
|
| 171 |
+
|
| 172 |
+
# Edges
|
| 173 |
+
edge_indexes = []
|
| 174 |
+
for bond in mol.GetBonds():
|
| 175 |
+
i = bond.GetBeginAtomIdx()
|
| 176 |
+
j = bond.GetEndAtomIdx()
|
| 177 |
+
edge_indexes.append((i, j))
|
| 178 |
+
edge_indexes.append((j, i))
|
| 179 |
+
|
| 180 |
+
# t - transpose, [num_of_edges, 2] -> [2, num_of_edges]
|
| 181 |
+
# contiguous - take the virtually transposed tensor and make its physical copy and lay bytes sequentially
|
| 182 |
+
|
| 183 |
+
edge_index = torch.tensor(edge_indexes, dtype=torch.long).t().contiguous()
|
| 184 |
+
|
| 185 |
+
# Protein (Sequence, tensor of integers)
|
| 186 |
+
tokens = [get_protein_features(char) for char in sequence]
|
| 187 |
+
if len(tokens) > self.max_seq_length:
|
| 188 |
+
tokens = tokens[: self.max_seq_length]
|
| 189 |
+
else:
|
| 190 |
+
tokens.extend(
|
| 191 |
+
[get_protein_features("PAD")] * (self.max_seq_length - len(tokens))
|
| 192 |
+
)
|
| 193 |
+
protein_tensor = torch.tensor(tokens, dtype=torch.long)
|
| 194 |
+
|
| 195 |
+
# Affinity
|
| 196 |
+
y = torch.tensor([affinity], dtype=torch.float)
|
| 197 |
+
return Data(x=x, edge_index=edge_index, protein_seq=protein_tensor, y=y)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
if __name__ == "__main__":
|
| 201 |
+
dataset = pd.read_csv("pdbbind_refined_dataset.csv")
|
| 202 |
+
dataset = BindingDataset(dataset)
|
| 203 |
+
|
| 204 |
+
train_size = int(0.8 * len(dataset))
|
| 205 |
+
test_size = len(dataset) - train_size
|
| 206 |
+
train_dataset, test_dataset = random_split(dataset, [train_size, test_size])
|
| 207 |
+
|
| 208 |
+
print(len(train_dataset))
|
| 209 |
+
print(len(test_dataset))
|
dataset_preparation.py β research/dataset_preparation.py
RENAMED
|
File without changes
|
inference.py β research/inference.py
RENAMED
|
File without changes
|
inference_attention.py β research/inference_attention.py
RENAMED
|
File without changes
|
research/loss.py
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
class WeightedMSELoss(nn.Module):
|
| 6 |
+
def __init__(self):
|
| 7 |
+
super().__init__()
|
| 8 |
+
|
| 9 |
+
def forward(self, prediction, target):
|
| 10 |
+
squared_errors = (prediction - target) ** 2
|
| 11 |
+
weights = torch.ones_like(target)
|
| 12 |
+
|
| 13 |
+
weights[target >= 6.0] = 2.0 # Fine x2 pKd > 6 good binding
|
| 14 |
+
weights[target >= 7.0] = 5.0 # Fine x5 pKd > 7 great binding
|
| 15 |
+
weights[target >= 8.0] = 10.0 # Fine x10 pKd > 8 super binding
|
| 16 |
+
|
| 17 |
+
weighted_loss = squared_errors * weights
|
| 18 |
+
return torch.mean(weighted_loss)
|
research/model.py
ADDED
|
@@ -0,0 +1,158 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import math
|
| 2 |
+
import torch
|
| 3 |
+
import torch.nn as nn
|
| 4 |
+
from torch_geometric.nn import GCNConv, GATConv, global_mean_pool
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
class PositionalEncoding(nn.Module):
    """Sinusoidal positional encoding (Vaswani et al., 2017) plus dropout.

    Adds a fixed, non-learned position signal to token embeddings so a
    downstream transformer can distinguish sequence order. The table is
    precomputed once for up to `seq_len` positions and registered as a
    buffer (moves with `.to(device)`, never trained).
    """

    def __init__(self, d_model: int, seq_len: int = 5000, dropout: float = 0.1):
        super().__init__()
        self.d_model = d_model
        self.seq_len = seq_len
        self.dropout = nn.Dropout(dropout)

        # Positions 0..seq_len-1 as a column vector: (seq_len, 1).
        positions = torch.arange(seq_len, dtype=torch.float).unsqueeze(1)
        # Frequency terms, computed in log-space for numerical stability.
        freqs = torch.exp(
            torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
        )

        encoding = torch.zeros(seq_len, d_model)
        encoding[:, 0::2] = torch.sin(positions * freqs)  # even channels
        encoding[:, 1::2] = torch.cos(positions * freqs)  # odd channels

        # Leading batch dim (1, seq_len, d_model) lets the table broadcast
        # over any batch size at add time.
        self.register_buffer("pe", encoding.unsqueeze(0))

    def forward(self, x):
        """Add positional encodings to `x` of shape [batch, seq_len, d_model]."""
        x = x + self.pe[:, : x.shape[1], :].requires_grad_(False)
        return self.dropout(x)
|
| 37 |
+
|
| 38 |
+
|
| 39 |
+
# class LigandGNN(nn.Module): # GCN CONV
|
| 40 |
+
# def __init__(self, input_dim, hidden_channels):
|
| 41 |
+
# super().__init__()
|
| 42 |
+
# self.hidden_channels = hidden_channels
|
| 43 |
+
#
|
| 44 |
+
# self.conv1 = GCNConv(input_dim, hidden_channels)
|
| 45 |
+
# self.conv2 = GCNConv(hidden_channels, hidden_channels)
|
| 46 |
+
# self.conv3 = GCNConv(hidden_channels, hidden_channels)
|
| 47 |
+
# self.dropout = nn.Dropout(0.2)
|
| 48 |
+
#
|
| 49 |
+
# def forward(self, x, edge_index, batch):
|
| 50 |
+
# x = self.conv1(x, edge_index)
|
| 51 |
+
# x = x.relu()
|
| 52 |
+
# x = self.dropout(x)
|
| 53 |
+
#
|
| 54 |
+
# x = self.conv2(x, edge_index)
|
| 55 |
+
# x = x.relu()
|
| 56 |
+
# x = self.conv3(x, edge_index)
|
| 57 |
+
# x = self.dropout(x)
|
| 58 |
+
#
|
| 59 |
+
# # Averaging nodes and got the molecula vector
|
| 60 |
+
# x = global_mean_pool(x, batch) # [batch_size, hidden_channels]
|
| 61 |
+
# return x
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
class LigandGNN(nn.Module):
    """Three-layer GAT encoder mapping a molecular graph to one vector.

    Three message-passing hops let every atom aggregate information from
    neighbours up to three bonds away; attention weighs each neighbour's
    contribution. Mean pooling over nodes yields one embedding per graph.
    """

    def __init__(self, input_dim, hidden_channels, heads=4, dropout=0.2):
        super().__init__()
        # concat=False averages the attention heads instead of concatenating
        # them, so every layer keeps the width at hidden_channels.
        self.conv1 = GATConv(input_dim, hidden_channels, heads=heads, concat=False)
        self.conv2 = GATConv(
            hidden_channels, hidden_channels, heads=heads, concat=False
        )
        self.conv3 = GATConv(
            hidden_channels, hidden_channels, heads=heads, concat=False
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, edge_index, batch):
        """Encode node features into graph-level vectors.

        Args:
            x: [num_nodes, input_dim] node features.
            edge_index: [2, num_edges] graph connectivity.
            batch: [num_nodes] graph-membership index used for pooling.

        Returns:
            [batch_size, hidden_channels] pooled graph embeddings.
        """
        # First two layers: conv -> ReLU -> dropout.
        for layer in (self.conv1, self.conv2):
            x = self.dropout(layer(x, edge_index).relu())

        # Final layer has no activation/dropout before pooling.
        x = self.conv3(x, edge_index)

        # Average node embeddings per graph -> one molecule vector each.
        return global_mean_pool(x, batch)
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
class ProteinTransformer(nn.Module):
    """Transformer encoder over an amino-acid token sequence.

    Embeds token ids, adds sinusoidal positions, runs N encoder layers,
    then masked-mean-pools over real (non-PAD) tokens and projects to
    `output_dim`. PAD is assumed to be token id 0 throughout.
    """

    def __init__(self, vocab_size, d_model=128, N=2, h=4, output_dim=128, dropout=0.2):
        super().__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.pos_encoder = PositionalEncoding(d_model, dropout=dropout)
        self.transformer = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=d_model, nhead=h, batch_first=True),
            num_layers=N,
        )
        self.fc = nn.Linear(d_model, output_dim)

    def forward(self, x):
        """Map token ids [batch, seq_len] to sequence embeddings [batch, output_dim]."""
        pad_positions = x == 0  # True at PAD tokens; excluded from attention
        # Scale embeddings by sqrt(d_model), per the original transformer recipe.
        h = self.embedding(x) * math.sqrt(self.d_model)
        h = self.pos_encoder(h)
        h = self.transformer(h, src_key_padding_mask=pad_positions)

        # Masked mean pooling: zero out PAD outputs, then divide by the
        # number of real tokens (clamped to avoid division by zero).
        keep = (~pad_positions).float().unsqueeze(-1)
        h = h * keep
        pooled = h.sum(dim=1) / keep.sum(dim=1).clamp(min=1e-9)

        return self.fc(pooled)
|
| 122 |
+
|
| 123 |
+
|
| 124 |
+
class BindingAffinityModel(nn.Module):
    """Two-tower binding-affinity (pKd) regressor.

    Tower 1 encodes the ligand graph with a GAT; tower 2 encodes the
    protein sequence with a transformer. Both produce vectors of size
    `hidden_channels`; their concatenation goes through an MLP head that
    outputs one scalar affinity per pair.
    """

    def __init__(
        self, num_node_features, hidden_channels=128, gat_heads=4, dropout=0.2
    ):
        super().__init__()
        # Tower 1 - graph attention network over the ligand.
        self.ligand_gnn = LigandGNN(
            input_dim=num_node_features,
            hidden_channels=hidden_channels,
            heads=gat_heads,
            dropout=dropout,
        )
        # Tower 2 - transformer over the protein sequence (26 token ids).
        self.protein_transformer = ProteinTransformer(
            vocab_size=26,
            d_model=hidden_channels,
            output_dim=hidden_channels,
            dropout=dropout,
        )
        # Regression head over the concatenated tower outputs.
        self.head = nn.Sequential(
            nn.Linear(hidden_channels * 2, hidden_channels),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_channels, 1),
        )

    def forward(self, x, edge_index, batch, protein_seq):
        """Predict affinity for a batch of ligand graphs and protein sequences."""
        ligand_vec = self.ligand_gnn(x, edge_index, batch)

        # protein_seq arrives flattened; recover [batch, seq_len] from the
        # number of graphs implied by the batch index vector.
        n_graphs = batch.max().item() + 1
        protein_vec = self.protein_transformer(protein_seq.view(n_graphs, -1))

        return self.head(torch.cat([ligand_vec, protein_vec], dim=1))
|
research/model_attention.py
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
from torch_geometric.nn import GATConv
|
| 4 |
+
from torch_geometric.utils import to_dense_batch
|
| 5 |
+
import torch.nn.functional as F
|
| 6 |
+
|
| 7 |
+
|
| 8 |
+
class CrossAttentionLayer(nn.Module):
    """Ligand-to-protein cross-attention block with a feed-forward tail.

    Ligand atom features act as queries; protein residue features supply
    keys and values, so each atom gathers protein context ("ligand
    enriched with knowledge about the protein"). Both the attention and
    the feed-forward sub-layers use a residual connection followed by
    LayerNorm (post-norm transformer style). The latest head-averaged
    attention map is cached on `last_attention_weights` for visualization.
    """

    def __init__(self, feature_dim, num_heads=4, dropout=0.1):
        super().__init__()
        # Multi-head attention over feature_dim-wide tokens.
        self.attention = nn.MultiheadAttention(
            feature_dim, num_heads, dropout=dropout, batch_first=True
        )
        # Post-attention normalization stabilizes training.
        self.norm = nn.LayerNorm(feature_dim)
        # Classic transformer FFN with 4x expansion; GELU tends to work
        # well in attention blocks.
        self.ff = nn.Sequential(
            nn.Linear(feature_dim, feature_dim * 4),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(feature_dim * 4, feature_dim),
        )
        self.norm_ff = nn.LayerNorm(feature_dim)
        # Head-averaged attention weights from the most recent forward pass.
        self.last_attention_weights = None

    def forward(self, ligand_features, protein_features, key_padding_mask=None):
        """Enrich ligand atoms with protein context.

        Args:
            ligand_features: [batch, atoms, dim] query tokens.
            protein_features: [batch, residues, dim] key/value tokens.
            key_padding_mask: [batch, residues] bool; True = ignore position.

        Returns:
            [batch, atoms, dim] updated ligand features.
        """
        attended, weights = self.attention(
            query=ligand_features,
            key=protein_features,
            value=protein_features,
            key_padding_mask=key_padding_mask,
            need_weights=True,
            average_attn_weights=True,
        )
        # Detach and move to CPU so downstream visualization never keeps
        # the autograd graph alive.
        self.last_attention_weights = weights.detach().cpu()

        # Residual + norm around attention, then around the FFN.
        out = self.norm(ligand_features + attended)
        out = self.norm_ff(out + self.ff(out))
        return out
|
| 55 |
+
|
| 56 |
+
|
| 57 |
+
class BindingAffinityModel(nn.Module):
    """Cross-attention binding-affinity (pKd) regressor.

    Ligand atoms are encoded by a 3-layer GAT; protein residues by an
    embedding followed by a 1D convolution for local context. A
    cross-attention layer lets each atom attend over residues; masked
    mean pooling over real atoms then feeds a two-layer MLP head that
    outputs one scalar per complex.
    """

    def __init__(
        self, num_node_features, hidden_channels=256, gat_heads=2, dropout=0.3
    ):
        super().__init__()
        # Stored as plain floats/ints so forward() can use functional dropout.
        self.dropout = dropout
        self.hidden_channels = hidden_channels

        # Tower 1 - Ligand GNN with GAT layers, using 3 GAT layers, so that every atom can "see" up to 3 bonds away,
        # Attention allows to measure the importance of the neighbours
        self.gat1 = GATConv(
            num_node_features, hidden_channels, heads=gat_heads, concat=False
        )
        self.gat2 = GATConv(
            hidden_channels, hidden_channels, heads=gat_heads, concat=False
        )
        self.gat3 = GATConv(
            hidden_channels, hidden_channels, heads=gat_heads, concat=False
        )

        # Tower 2 - Protein Transformer, 22 = 21 amino acids + 1 padding token PAD
        self.protein_embedding = nn.Embedding(22, hidden_channels)
        # Additional positional encoding (simple linear) to give the model information about the order
        self.prot_conv = nn.Conv1d(
            hidden_channels, hidden_channels, kernel_size=3, padding=1
        )

        # Cross-Attention Layer, atoms attending to amino acids
        self.cross_attention = CrossAttentionLayer(
            feature_dim=hidden_channels, num_heads=4, dropout=dropout
        )

        self.fc1 = nn.Linear(hidden_channels, hidden_channels)
        self.fc2 = nn.Linear(hidden_channels, 1)  # Final output for regression, pKd

    def forward(self, x, edge_index, batch, protein_seq):
        """Predict affinity.

        Args:
            x: [total_atoms, num_node_features] node features for all graphs.
            edge_index: [2, num_edges] ligand-graph connectivity.
            batch: [total_atoms] graph-membership index per atom.
            protein_seq: flattened token ids, reshaped here to [batch, seq_len];
                PAD is assumed to be token id 0.

        Returns:
            [batch, 1] predicted affinity values.
        """
        # Ligand GNN forward pass (Graph -> Node Embeddings)
        x = F.elu(self.gat1(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = F.elu(self.gat2(x, edge_index))
        x = F.dropout(x, p=self.dropout, training=self.training)

        x = F.elu(self.gat3(x, edge_index))  # [Total_Atoms, Hidden_Channels]

        # Convert graph into tensor [Batch, Max_Atoms, Hidden_Channels]
        # to_dense_batch adds zeros paddings where necessary to the size of the largest graph in the batch
        ligand_dense, ligand_mask = to_dense_batch(x, batch)
        # ligand_dense: [Batch, Max_Atoms, Hidden_Channels]
        # ligand_mask: [Batch, Max_Atoms] True where there is real atom, False where there is padding

        batch_size = ligand_dense.size(0)
        protein_seq = protein_seq.view(batch_size, -1)  # [Batch, Seq_Len]

        # Protein forward pass protein_seq: [Batch, Seq_Len]
        p = self.protein_embedding(protein_seq)  # [Batch, Seq_Len, Hidden_Channels]

        # A simple convolution to understand local context in amino acids
        p = p.permute(0, 2, 1)  # Change to [Batch, Hidden_Channels, Seq_Len] for Conv1d
        p = F.relu(self.prot_conv(p))
        p = p.permute(0, 2, 1)  # [Batch, Seq, Hidden_Channels]

        # Mask for protein (where PAD=0, True, but MHA needs True where IGNOREME)
        # In Pytorch MHA, the key_padding_mask should be True where we want to ignore
        protein_pad_mask = protein_seq == 0

        # Cross-Attention
        x_cross = self.cross_attention(
            ligand_dense, p, key_padding_mask=protein_pad_mask
        )

        # Pooling over atoms to get a single vector per molecule, considering only real atoms, ignoring paddings
        # ligand mask True where real atom, False where padding
        mask_expanded = ligand_mask.unsqueeze(-1)  # [Batch, Max_Atoms, 1]

        # Zero out the padded atom features
        x_cross = x_cross * mask_expanded

        # Sum the features of real atoms / number of real atoms to get the mean
        sum_features = torch.sum(x_cross, dim=1)  # [Batch, Hidden_Channels]
        num_atoms = torch.sum(mask_expanded, dim=1)  # [Batch, 1]
        pooled_x = sum_features / (num_atoms + 1e-6)  # Avoid division by zero

        # MLP Head
        out = F.relu(self.fc1(pooled_x))
        out = F.dropout(out, p=self.dropout, training=self.training)
        out = self.fc2(out)
        return out
|
model_pl.py β research/model_pl.py
RENAMED
|
File without changes
|
optuna_train.py β research/optuna_train.py
RENAMED
|
File without changes
|
optuna_train_attention.py β research/optuna_train_attention.py
RENAMED
|
@@ -7,11 +7,12 @@ import numpy as np
|
|
| 7 |
from torch_geometric.loader import DataLoader
|
| 8 |
from torch.utils.data import random_split
|
| 9 |
from dataset import BindingDataset
|
|
|
|
| 10 |
from model_attention import BindingAffinityModel
|
| 11 |
|
| 12 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 13 |
N_TRIALS = 50
|
| 14 |
-
MAX_EPOCHS_PER_TRIAL =
|
| 15 |
LOG_DIR = "runs"
|
| 16 |
DATA_CSV = "pdbbind_refined_dataset.csv"
|
| 17 |
|
|
@@ -88,7 +89,8 @@ def objective(trial):
|
|
| 88 |
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 89 |
optimizer, mode="min", factor=0.5, patience=5
|
| 90 |
)
|
| 91 |
-
criterion = nn.MSELoss()
|
|
|
|
| 92 |
|
| 93 |
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
|
| 94 |
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
|
|
@@ -118,7 +120,7 @@ if __name__ == "__main__":
|
|
| 118 |
direction="minimize",
|
| 119 |
pruner=optuna.pruners.MedianPruner(n_min_trials=5, n_warmup_steps=10),
|
| 120 |
storage=storage_name,
|
| 121 |
-
study_name="
|
| 122 |
load_if_exists=True,
|
| 123 |
)
|
| 124 |
print("Start hyperparameter optimization...")
|
|
|
|
| 7 |
from torch_geometric.loader import DataLoader
|
| 8 |
from torch.utils.data import random_split
|
| 9 |
from dataset import BindingDataset
|
| 10 |
+
from loss import WeightedMSELoss
|
| 11 |
from model_attention import BindingAffinityModel
|
| 12 |
|
| 13 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 14 |
N_TRIALS = 50
|
| 15 |
+
MAX_EPOCHS_PER_TRIAL = 50
|
| 16 |
LOG_DIR = "runs"
|
| 17 |
DATA_CSV = "pdbbind_refined_dataset.csv"
|
| 18 |
|
|
|
|
| 89 |
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 90 |
optimizer, mode="min", factor=0.5, patience=5
|
| 91 |
)
|
| 92 |
+
# criterion = nn.MSELoss()
|
| 93 |
+
criterion = WeightedMSELoss()
|
| 94 |
|
| 95 |
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
|
| 96 |
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
|
|
|
|
| 120 |
direction="minimize",
|
| 121 |
pruner=optuna.pruners.MedianPruner(n_min_trials=5, n_warmup_steps=10),
|
| 122 |
storage=storage_name,
|
| 123 |
+
study_name="binding_prediction_optimization_attentionWeightedLoss",
|
| 124 |
load_if_exists=True,
|
| 125 |
)
|
| 126 |
print("Start hyperparameter optimization...")
|
pdbbind_refined_dataset.csv β research/pdbbind_refined_dataset.csv
RENAMED
|
File without changes
|
research/requirements_dev.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
|
| 2 |
+
pytorch-lightning
|
| 3 |
+
optuna
|
| 4 |
+
optuna-dashboard
|
| 5 |
+
tensorboard
|
| 6 |
+
|
| 7 |
+
numpy
|
| 8 |
+
pandas
|
| 9 |
+
matplotlib
|
| 10 |
+
seaborn
|
| 11 |
+
|
| 12 |
+
rdkit
|
| 13 |
+
biopython
|
| 14 |
+
|
| 15 |
+
jupyter
|
| 16 |
+
tqdm
|
| 17 |
+
black
|
| 18 |
+
|
train.py β research/train.py
RENAMED
|
File without changes
|
train_attention.py β research/train_attention.py
RENAMED
|
@@ -6,6 +6,7 @@ import pandas as pd
|
|
| 6 |
from torch.utils.data import random_split
|
| 7 |
from torch_geometric.loader import DataLoader
|
| 8 |
from dataset import BindingDataset
|
|
|
|
| 9 |
from model_attention import BindingAffinityModel
|
| 10 |
from tqdm import tqdm
|
| 11 |
from torch.utils.tensorboard import SummaryWriter
|
|
@@ -13,7 +14,7 @@ import numpy as np
|
|
| 13 |
from datetime import datetime
|
| 14 |
import os
|
| 15 |
|
| 16 |
-
# 2.02
|
| 17 |
# BATCH_SIZE = 16
|
| 18 |
# LR = 0.00035 # Reduced learning rate
|
| 19 |
# WEIGHT_DECAY = 1e-5 # Slightly increased weight decay (regularization)
|
|
@@ -23,16 +24,27 @@ import os
|
|
| 23 |
# HIDDEN_CHANNELS = 256
|
| 24 |
|
| 25 |
# 1.90 from Optuna
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 26 |
BATCH_SIZE = 16
|
| 27 |
-
LR = 0.
|
| 28 |
-
WEIGHT_DECAY = 1e-
|
| 29 |
-
DROPOUT = 0.
|
| 30 |
EPOCHS = 100
|
| 31 |
-
HIDDEN_CHANNELS =
|
| 32 |
-
GAT_HEADS =
|
| 33 |
|
| 34 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 35 |
-
LOG_DIR =
|
|
|
|
|
|
|
| 36 |
TOP_K = 3
|
| 37 |
SAVES_DIR = LOG_DIR + "/models"
|
| 38 |
|
|
@@ -128,7 +140,8 @@ def main():
|
|
| 128 |
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 129 |
optimizer, mode="min", factor=0.5, patience=8
|
| 130 |
)
|
| 131 |
-
criterion = nn.MSELoss()
|
|
|
|
| 132 |
|
| 133 |
top_models = []
|
| 134 |
|
|
@@ -152,7 +165,7 @@ def main():
|
|
| 152 |
end="",
|
| 153 |
)
|
| 154 |
|
| 155 |
-
filename = f"{SAVES_DIR}/model_ep{epoch:03d}
|
| 156 |
|
| 157 |
torch.save(model.state_dict(), filename)
|
| 158 |
top_models.append({"loss": test_loss, "path": filename, "epoch": epoch})
|
|
@@ -173,7 +186,7 @@ def main():
|
|
| 173 |
print("Training finished.")
|
| 174 |
print("Top models saved:")
|
| 175 |
for i, m in enumerate(top_models):
|
| 176 |
-
print(f"{i + 1}. {m['path']} (
|
| 177 |
|
| 178 |
|
| 179 |
if __name__ == "__main__":
|
|
|
|
| 6 |
from torch.utils.data import random_split
|
| 7 |
from torch_geometric.loader import DataLoader
|
| 8 |
from dataset import BindingDataset
|
| 9 |
+
from loss import WeightedMSELoss
|
| 10 |
from model_attention import BindingAffinityModel
|
| 11 |
from tqdm import tqdm
|
| 12 |
from torch.utils.tensorboard import SummaryWriter
|
|
|
|
| 14 |
from datetime import datetime
|
| 15 |
import os
|
| 16 |
|
| 17 |
+
# 2.02 default parameters
|
| 18 |
# BATCH_SIZE = 16
|
| 19 |
# LR = 0.00035 # Reduced learning rate
|
| 20 |
# WEIGHT_DECAY = 1e-5 # Slightly increased weight decay (regularization)
|
|
|
|
| 24 |
# HIDDEN_CHANNELS = 256
|
| 25 |
|
| 26 |
# 1.90 from Optuna
|
| 27 |
+
# BATCH_SIZE = 16
|
| 28 |
+
# LR = 0.000034
|
| 29 |
+
# WEIGHT_DECAY = 1e-6
|
| 30 |
+
# DROPOUT = 0.26
|
| 31 |
+
# EPOCHS = 100
|
| 32 |
+
# HIDDEN_CHANNELS = 256
|
| 33 |
+
# GAT_HEADS = 2
|
| 34 |
+
|
| 35 |
+
# Weighted Loss
|
| 36 |
BATCH_SIZE = 16
|
| 37 |
+
LR = 0.00022
|
| 38 |
+
WEIGHT_DECAY = 1e-5
|
| 39 |
+
DROPOUT = 0.25
|
| 40 |
EPOCHS = 100
|
| 41 |
+
HIDDEN_CHANNELS = 128
|
| 42 |
+
GAT_HEADS = 4
|
| 43 |
|
| 44 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 45 |
+
LOG_DIR = (
|
| 46 |
+
f"runs/experiment_attention{datetime.now().strftime('%Y%m%d_%H%M%S')}_weighted_loss"
|
| 47 |
+
)
|
| 48 |
TOP_K = 3
|
| 49 |
SAVES_DIR = LOG_DIR + "/models"
|
| 50 |
|
|
|
|
| 140 |
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
|
| 141 |
optimizer, mode="min", factor=0.5, patience=8
|
| 142 |
)
|
| 143 |
+
# criterion = nn.MSELoss()
|
| 144 |
+
criterion = WeightedMSELoss()
|
| 145 |
|
| 146 |
top_models = []
|
| 147 |
|
|
|
|
| 165 |
end="",
|
| 166 |
)
|
| 167 |
|
| 168 |
+
filename = f"{SAVES_DIR}/model_ep{epoch:03d}_weighted_loss{test_loss:.4f}.pth"
|
| 169 |
|
| 170 |
torch.save(model.state_dict(), filename)
|
| 171 |
top_models.append({"loss": test_loss, "path": filename, "epoch": epoch})
|
|
|
|
| 186 |
print("Training finished.")
|
| 187 |
print("Top models saved:")
|
| 188 |
for i, m in enumerate(top_models):
|
| 189 |
+
print(f"{i + 1}. {m['path']} (Weighted Loss: {m['loss']:.4f})")
|
| 190 |
|
| 191 |
|
| 192 |
if __name__ == "__main__":
|
train_pl.py β research/train_pl.py
RENAMED
|
File without changes
|
visualization.ipynb β research/visualization.ipynb
RENAMED
|
File without changes
|
utils.py
CHANGED
|
@@ -7,21 +7,24 @@ from rdkit.Chem import Descriptors
|
|
| 7 |
import py3Dmol
|
| 8 |
from jinja2 import Environment, FileSystemLoader
|
| 9 |
from google import genai
|
| 10 |
-
from google.genai import types
|
| 11 |
from decouple import config
|
| 12 |
|
| 13 |
GEMINI_API_KEY = config("GEMINI_API_KEY")
|
| 14 |
|
| 15 |
-
|
| 16 |
from dataset import get_atom_features, get_protein_features
|
| 17 |
from model_attention import BindingAffinityModel
|
| 18 |
|
| 19 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 20 |
-
# ΠΠ±Π½ΠΎΠ²ΠΈΡΠ΅ ΠΏΡΡΡ, Π΅ΡΠ»ΠΈ Π½ΡΠΆΠ½ΠΎ
|
| 21 |
-
MODEL_PATH = "runs/experiment_attention20260124_104439_optuna/models/model_ep041_mse1.9153.pth"
|
| 22 |
|
| 23 |
-
|
| 24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def get_inference_data(ligand_smiles, protein_sequence, model_path=MODEL_PATH):
|
|
@@ -45,14 +48,18 @@ def get_inference_data(ligand_smiles, protein_sequence, model_path=MODEL_PATH):
|
|
| 45 |
tokens = tokens[:1200]
|
| 46 |
else:
|
| 47 |
tokens.extend([0] * (1200 - len(tokens)))
|
| 48 |
-
protein_sequence_tensor =
|
|
|
|
|
|
|
| 49 |
|
| 50 |
data = Data(x=x, edge_index=edge_index)
|
| 51 |
batch = Batch.from_data_list([data]).to(DEVICE)
|
| 52 |
num_features = x.shape[1]
|
| 53 |
|
| 54 |
# Model
|
| 55 |
-
model = BindingAffinityModel(
|
|
|
|
|
|
|
| 56 |
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
|
| 57 |
model.eval()
|
| 58 |
|
|
@@ -65,7 +72,9 @@ def get_inference_data(ligand_smiles, protein_sequence, model_path=MODEL_PATH):
|
|
| 65 |
importance = attention_weights[:, :real_prot_len].max(dim=1).values.cpu().numpy()
|
| 66 |
|
| 67 |
if importance.max() > 0:
|
| 68 |
-
importance = (importance - importance.min()) / (
|
|
|
|
|
|
|
| 69 |
|
| 70 |
importance[importance < 0.01] = 0
|
| 71 |
return mol, importance, pred.item()
|
|
@@ -112,19 +121,16 @@ def get_lipinski_properties(mol):
|
|
| 112 |
"violations": violations,
|
| 113 |
"status_text": status,
|
| 114 |
"css_class": css_class,
|
| 115 |
-
"bad_params": ", ".join(bad_params) if bad_params else "None"
|
| 116 |
}
|
| 117 |
|
| 118 |
|
| 119 |
def get_py3dmol_view(mol, importance):
|
| 120 |
view = py3Dmol.view(width="100%", height="600px")
|
| 121 |
view.addModel(Chem.MolToMolBlock(mol), "sdf")
|
| 122 |
-
view.setBackgroundColor(
|
| 123 |
|
| 124 |
-
view.setStyle({}, {
|
| 125 |
-
'stick': {'radius': 0.15},
|
| 126 |
-
'sphere': {'scale': 0.25}
|
| 127 |
-
})
|
| 128 |
|
| 129 |
indices_sorted = np.argsort(importance)[::-1]
|
| 130 |
top_indices = set(indices_sorted[:15])
|
|
@@ -137,16 +143,19 @@ def get_py3dmol_view(mol, importance):
|
|
| 137 |
symbol = mol.GetAtomWithIdx(i).GetSymbol()
|
| 138 |
label_text = f"{i}:{symbol}:{val:.2f}"
|
| 139 |
|
| 140 |
-
view.addLabel(
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
|
| 148 |
-
|
| 149 |
-
|
|
|
|
|
|
|
|
|
|
| 150 |
view.zoomTo()
|
| 151 |
return view
|
| 152 |
|
|
@@ -166,32 +175,40 @@ def save_standalone_ngl_html(mol, importance, filepath):
|
|
| 166 |
indices_sorted = np.argsort(importance)[::-1]
|
| 167 |
top_indices = indices_sorted[:15]
|
| 168 |
|
| 169 |
-
|
| 170 |
selection_list = [str(i) for i in top_indices]
|
| 171 |
selection_str = "@" + ",".join(selection_list)
|
| 172 |
|
| 173 |
if not selection_list:
|
| 174 |
selection_str = "@-1"
|
| 175 |
|
|
|
|
|
|
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
rendered_html = template.render(pdb_block=final_pdb_block, selection_str=selection_str)
|
| 181 |
|
| 182 |
with open(filepath, "w", encoding="utf-8") as f:
|
| 183 |
f.write(rendered_html)
|
| 184 |
|
| 185 |
|
| 186 |
-
def get_gemini_explanation(
|
|
|
|
|
|
|
| 187 |
if not GEMINI_API_KEY:
|
| 188 |
return "<p class='text-warning'>API Key for Gemini not found. Please set GOOGLE_API_KEY environment variable.</p>"
|
| 189 |
|
| 190 |
# Forming a list of top important atoms for a prompt
|
| 191 |
-
atoms_desc = ", ".join(
|
|
|
|
|
|
|
| 192 |
|
| 193 |
# Cut a protein to not spend too many tokens
|
| 194 |
-
prot_short =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
prompt = f"""
|
| 197 |
You are an expert Computational Chemist and Drug Discovery Scientist.
|
|
@@ -216,9 +233,8 @@ def get_gemini_explanation(ligand_smiles, protein_sequence, affinity, top_atoms,
|
|
| 216 |
try:
|
| 217 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 218 |
response = client.models.generate_content(
|
| 219 |
-
model="gemini-2.5-flash",
|
| 220 |
-
contents=prompt
|
| 221 |
)
|
| 222 |
return response.text
|
| 223 |
except Exception as e:
|
| 224 |
-
return f"<p class='text-danger'>Error generating explanation: {str(e)}</p>"
|
|
|
|
| 7 |
import py3Dmol
|
| 8 |
from jinja2 import Environment, FileSystemLoader
|
| 9 |
from google import genai
|
|
|
|
| 10 |
from decouple import config
|
| 11 |
|
| 12 |
GEMINI_API_KEY = config("GEMINI_API_KEY")
|
| 13 |
|
| 14 |
+
|
| 15 |
from dataset import get_atom_features, get_protein_features
|
| 16 |
from model_attention import BindingAffinityModel
|
| 17 |
|
| 18 |
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
|
|
|
|
|
|
| 19 |
|
| 20 |
+
# MODEL_PATH = "runs/experiment_attention20260124_104439_optuna/models/model_ep041_mse1.9153.pth"
|
| 21 |
+
#
|
| 22 |
+
# GAT_HEADS = 2
|
| 23 |
+
# HIDDEN_CHANNELS = 256
|
| 24 |
+
|
| 25 |
+
MODEL_PATH = "runs/experiment_attention20260127_055340_weighted_loss/models/model_ep028_weighted_loss6.7715.pth"
|
| 26 |
+
GAT_HEADS = 4
|
| 27 |
+
HIDDEN_CHANNELS = 128
|
| 28 |
|
| 29 |
|
| 30 |
def get_inference_data(ligand_smiles, protein_sequence, model_path=MODEL_PATH):
|
|
|
|
| 48 |
tokens = tokens[:1200]
|
| 49 |
else:
|
| 50 |
tokens.extend([0] * (1200 - len(tokens)))
|
| 51 |
+
protein_sequence_tensor = (
|
| 52 |
+
torch.tensor(tokens, dtype=torch.long).unsqueeze(0).to(DEVICE)
|
| 53 |
+
)
|
| 54 |
|
| 55 |
data = Data(x=x, edge_index=edge_index)
|
| 56 |
batch = Batch.from_data_list([data]).to(DEVICE)
|
| 57 |
num_features = x.shape[1]
|
| 58 |
|
| 59 |
# Model
|
| 60 |
+
model = BindingAffinityModel(
|
| 61 |
+
num_features, hidden_channels=HIDDEN_CHANNELS, gat_heads=GAT_HEADS
|
| 62 |
+
).to(DEVICE)
|
| 63 |
model.load_state_dict(torch.load(model_path, map_location=DEVICE))
|
| 64 |
model.eval()
|
| 65 |
|
|
|
|
| 72 |
importance = attention_weights[:, :real_prot_len].max(dim=1).values.cpu().numpy()
|
| 73 |
|
| 74 |
if importance.max() > 0:
|
| 75 |
+
importance = (importance - importance.min()) / (
|
| 76 |
+
importance.max() - importance.min()
|
| 77 |
+
)
|
| 78 |
|
| 79 |
importance[importance < 0.01] = 0
|
| 80 |
return mol, importance, pred.item()
|
|
|
|
| 121 |
"violations": violations,
|
| 122 |
"status_text": status,
|
| 123 |
"css_class": css_class,
|
| 124 |
+
"bad_params": ", ".join(bad_params) if bad_params else "None",
|
| 125 |
}
|
| 126 |
|
| 127 |
|
| 128 |
def get_py3dmol_view(mol, importance):
|
| 129 |
view = py3Dmol.view(width="100%", height="600px")
|
| 130 |
view.addModel(Chem.MolToMolBlock(mol), "sdf")
|
| 131 |
+
view.setBackgroundColor("white")
|
| 132 |
|
| 133 |
+
view.setStyle({}, {"stick": {"radius": 0.15}, "sphere": {"scale": 0.25}})
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
indices_sorted = np.argsort(importance)[::-1]
|
| 136 |
top_indices = set(indices_sorted[:15])
|
|
|
|
| 143 |
symbol = mol.GetAtomWithIdx(i).GetSymbol()
|
| 144 |
label_text = f"{i}:{symbol}:{val:.2f}"
|
| 145 |
|
| 146 |
+
view.addLabel(
|
| 147 |
+
label_text,
|
| 148 |
+
{
|
| 149 |
+
"position": {"x": pos.x, "y": pos.y, "z": pos.z},
|
| 150 |
+
"fontSize": 14,
|
| 151 |
+
"fontColor": "white",
|
| 152 |
+
"backgroundColor": "black",
|
| 153 |
+
"backgroundOpacity": 0.7,
|
| 154 |
+
"borderThickness": 0,
|
| 155 |
+
"inFront": True,
|
| 156 |
+
"showBackground": True,
|
| 157 |
+
},
|
| 158 |
+
)
|
| 159 |
view.zoomTo()
|
| 160 |
return view
|
| 161 |
|
|
|
|
| 175 |
indices_sorted = np.argsort(importance)[::-1]
|
| 176 |
top_indices = indices_sorted[:15]
|
| 177 |
|
|
|
|
| 178 |
selection_list = [str(i) for i in top_indices]
|
| 179 |
selection_str = "@" + ",".join(selection_list)
|
| 180 |
|
| 181 |
if not selection_list:
|
| 182 |
selection_str = "@-1"
|
| 183 |
|
| 184 |
+
env = Environment(loader=FileSystemLoader("templates"))
|
| 185 |
+
template = env.get_template("ngl_view.html")
|
| 186 |
|
| 187 |
+
rendered_html = template.render(
|
| 188 |
+
pdb_block=final_pdb_block, selection_str=selection_str
|
| 189 |
+
)
|
|
|
|
| 190 |
|
| 191 |
with open(filepath, "w", encoding="utf-8") as f:
|
| 192 |
f.write(rendered_html)
|
| 193 |
|
| 194 |
|
| 195 |
+
def get_gemini_explanation(
|
| 196 |
+
ligand_smiles, protein_sequence, affinity, top_atoms, lipinski
|
| 197 |
+
):
|
| 198 |
if not GEMINI_API_KEY:
|
| 199 |
return "<p class='text-warning'>API Key for Gemini not found. Please set GOOGLE_API_KEY environment variable.</p>"
|
| 200 |
|
| 201 |
# Forming a list of top important atoms for a prompt
|
| 202 |
+
atoms_desc = ", ".join(
|
| 203 |
+
[f"{a['symbol']}(idx {a['id']}, score {a['score']})" for a in top_atoms[:10]]
|
| 204 |
+
)
|
| 205 |
|
| 206 |
# Cut a protein to not spend too many tokens
|
| 207 |
+
prot_short = (
|
| 208 |
+
protein_sequence[:100] + "..."
|
| 209 |
+
if len(protein_sequence) > 100
|
| 210 |
+
else protein_sequence
|
| 211 |
+
)
|
| 212 |
|
| 213 |
prompt = f"""
|
| 214 |
You are an expert Computational Chemist and Drug Discovery Scientist.
|
|
|
|
| 233 |
try:
|
| 234 |
client = genai.Client(api_key=GEMINI_API_KEY)
|
| 235 |
response = client.models.generate_content(
|
| 236 |
+
model="gemini-2.5-flash", contents=prompt
|
|
|
|
| 237 |
)
|
| 238 |
return response.text
|
| 239 |
except Exception as e:
|
| 240 |
+
return f"<p class='text-danger'>Error generating explanation: {str(e)}</p>"
|