Spaces:

WSobo
/

Struct2Seq-GNN-Web-Demo

Runtime error

App Files Files Community

WSobo commited on Apr 5

Commit

cd525f1

verified ·

1 Parent(s): dbc4151

Update app.py

Browse files

Files changed (1) hide show

app.py +197 -42

app.py CHANGED Viewed

@@ -1,11 +1,13 @@
 import os
 import urllib.request
 import gradio as gr
 import torch
-import prody
 import numpy as np
-# Import your model class (Make sure model_utils.py is uploaded to your Space!)
 from model_utils import Struct2SeqGNN
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
@@ -16,7 +18,6 @@ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 raw_github_url = "https://raw.githubusercontent.com/WSobo/Struct2Seq-GNN/main/pretrained_models/v2.0/best_model.pt"
 model_path = "best_model.pt"
-# Download weights if they aren't already cached in the Space
 if not os.path.exists(model_path):
     print("Downloading model weights from GitHub...")
     urllib.request.urlretrieve(raw_github_url, model_path)
@@ -38,56 +39,211 @@ if list(state_dict.keys())[0].startswith('module.'):
 model.load_state_dict(state_dict)
 model.eval()
-# Standard Amino Acid alphabet (Update this if your model uses a different index order!)
 AA_ALPHABET = "ACDEFGHIKLMNPQRSTVWYX"
 # ---------------------------------------------------------
-# 2. INFERENCE PIPELINE
 # ---------------------------------------------------------
-def predict_sequence(pdb_file):
-    if pdb_file is None:
-        return "Please upload a .pdb file."
-    try:
-        # Step 2.1: Silence ProDy logs to keep your server console clean
-        prody.confProDy(verbosity='none')
-        # Step 2.2: Parse the uploaded PDB
-        pdb = prody.parsePDB(pdb_file.name)
-        if pdb is None:
-            return "Error: Could not parse the PDB file."
-        # Step 2.3: Extract backbone coordinates
-        # (Grabbing C-alphas to get the sequence length and main coordinates)
-        calphas = pdb.select('calpha')
-        if calphas is None:
-            return "Error: No alpha carbons found in the PDB."
-        num_residues = len(calphas)
-        # Step 2.4: Convert coordinates to numpy/tensors
-        # coords = calphas.getCoords()  # Shape: [num_residues, 3]
-        # =====================================================================
-        # Step 2.5: YOUR GRAPH CONSTRUCTION GOES HERE
-        # Copy the exact logic you use in `utils.dataset.Struct2SeqDataset`
-        # to turn these coordinates into your graph components (x, edge_index).
-        # =====================================================================
-        # NOTE: Delete this placeholder dummy block once your logic is in!
-        dummy_logits = torch.randn((num_residues, 21)).to(device)
-        # Step 2.6: Run the forward pass
         with torch.no_grad():
-            # logits = model(x, edge_index, ...)
-            logits = dummy_logits # Placeholder
-        # Step 2.7: Decode logits to an amino acid string
-        # Argmax gets the index of the highest probability AA for each residue
         predicted_indices = torch.argmax(logits, dim=-1).cpu().numpy()
-        # Map indices back to the alphabet characters
         predicted_seq = "".join([AA_ALPHABET[idx] for idx in predicted_indices])
         return f"Predicted Sequence ({num_residues} residues):\n\n{predicted_seq}"
@@ -95,9 +251,8 @@ def predict_sequence(pdb_file):
     except Exception as e:
         return f"Error processing PDB: {str(e)}"
 # ---------------------------------------------------------
-# 3. GRADIO UI
 # ---------------------------------------------------------
 demo = gr.Interface(
     fn=predict_sequence,
@@ -105,12 +260,12 @@ demo = gr.Interface(
     outputs=gr.Textbox(label="Designed Amino Acid Sequence", show_copy_button=True, lines=5),
     title="Struct2Seq-GNN: Inverse Protein Folding",
     description=(
-        "Upload a 3D target backbone to generate a sequence optimized by a custom Graph Neural Network.\n\n"
         "**Model Performance:** Achieves ~30.3% global sequence recovery and **35.1% binding-pocket recovery** "
         "on noisy coordinates, confirming strong generalization to underlying biophysical folding constraints."
     ),
     allow_flagging="never",
-    theme=gr.themes.Soft() # Adds a cleaner, more professional look
 )
 if __name__ == "__main__":

 import os
 import urllib.request
+import importlib.util
 import gradio as gr
 import torch
 import numpy as np
+from torch_geometric.data import HeteroData
+from torch_geometric.nn import radius_graph, radius
+# Import your model class (Make sure model_utils.py is in your Space!)
 from model_utils import Struct2SeqGNN
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 raw_github_url = "https://raw.githubusercontent.com/WSobo/Struct2Seq-GNN/main/pretrained_models/v2.0/best_model.pt"
 model_path = "best_model.pt"
 if not os.path.exists(model_path):
     print("Downloading model weights from GitHub...")
     urllib.request.urlretrieve(raw_github_url, model_path)
 model.load_state_dict(state_dict)
 model.eval()
+# Standard Amino Acid alphabet
 AA_ALPHABET = "ACDEFGHIKLMNPQRSTVWYX"
 # ---------------------------------------------------------
+# 2. DATA PROCESSING PIPELINE (PyG HeteroData)
 # ---------------------------------------------------------
+def _load_ligandmpnn_parsers():
+    """Load LigandMPNN parser functions directly from the HF Space root."""
+    parser_file = "data_utils.py"
+    if not os.path.exists(parser_file):
+        raise ImportError(
+            "Could not find data_utils.py. "
+            "Please upload the LigandMPNN data_utils.py file to the root of your Hugging Face Space."
+        )
+    spec = importlib.util.spec_from_file_location("ligandmpnn_data_utils", parser_file)
+    module = importlib.util.module_from_spec(spec)
+    assert spec.loader is not None
+    spec.loader.exec_module(module)
+    return module.parse_PDB, module.featurize
+parse_PDB, featurize = _load_ligandmpnn_parsers()
+def get_ligandmpnn_features(pdb_path, device="cpu"):
+    protein_dict, backbone, other_atoms, icodes, _ = parse_PDB(pdb_path, device=device)
+    if "chain_letters" in protein_dict:
+        protein_dict["chain_mask"] = torch.ones(
+            len(protein_dict["chain_letters"]),
+            dtype=torch.int32,
+            device=device
+        )
+    feature_dict = featurize(protein_dict, cutoff_for_score=8.0)
+    feature_dict["ligand_Y"] = protein_dict.get("Y", None)
+    feature_dict["ligand_Y_t"] = protein_dict.get("Y_t", None)
+    feature_dict["ligand_Y_m"] = protein_dict.get("Y_m", None)
+    return feature_dict
+def compute_dihedrals(X):
+    N = X[:, 0, :]
+    CA = X[:, 1, :]
+    C = X[:, 2, :]
+    C_prev = torch.cat([C[0:1], C[:-1]], dim=0)
+    N_next = torch.cat([N[1:], N[-1:]], dim=0)
+    CA_next = torch.cat([CA[1:], CA[-1:]], dim=0)
+    def dihedral(p0, p1, p2, p3):
+        b0 = p0 - p1
+        b1 = p2 - p1
+        b2 = p3 - p2
+        b1_norm = b1 / (torch.linalg.norm(b1, dim=-1, keepdim=True) + 1e-7)
+        n1 = torch.linalg.cross(b0, b1_norm, dim=-1)
+        n2 = torch.linalg.cross(b1_norm, b2, dim=-1)
+        m = torch.linalg.cross(n1, b1_norm, dim=-1)
+        x = torch.sum(n1 * n2, dim=-1)
+        y = torch.sum(m * n2, dim=-1)
+        return torch.atan2(y, x)
+    phi = dihedral(C_prev, N, CA, C)
+    psi = dihedral(N, CA, C, N_next)
+    omega = dihedral(CA, C, N_next, CA_next)
+    dihedrals = torch.stack([phi, psi, omega], dim=-1)
+    return torch.cat([torch.sin(dihedrals), torch.cos(dihedrals)], dim=-1)
+def encode_ligand_elements(element_ids):
+    M = element_ids.shape[0]
+    one_hot = torch.zeros((M, 6), dtype=torch.float32, device=element_ids.device)
+    mask_C = (element_ids == 6)
+    mask_N = (element_ids == 7)
+    mask_O = (element_ids == 8)
+    mask_S = (element_ids == 16)
+    mask_P = (element_ids == 15)
+    one_hot[mask_C, 0] = 1.0
+    one_hot[mask_N, 1] = 1.0
+    one_hot[mask_O, 2] = 1.0
+    one_hot[mask_S, 3] = 1.0
+    one_hot[mask_P, 4] = 1.0
+    mask_other = ~(mask_C | mask_N | mask_O | mask_S | mask_P)
+    one_hot[mask_other, 5] = 1.0
+    return one_hot
+def dict_to_pyg_data(feature_dict, radius_cutoff=8.0):
+    data = HeteroData()
+    # 1. Build Protein Nodes
+    X = feature_dict["X"].squeeze(0)
+    if X.dim() == 3 and X.size(1) >= 4:
+        ca_coords = X[:, 1, :]
+    else:
+        ca_coords = X
+    sequence_labels = feature_dict["S"].squeeze(0)
+    mask = feature_dict["mask"].squeeze(0).bool()
+    dihedral_features = compute_dihedrals(X)
+    ca_coords = ca_coords[mask]
+    sequence_labels = sequence_labels[mask]
+    dihedral_features = dihedral_features[mask]
+    data['protein'].x = dihedral_features.clone().float()
+    data['protein'].pos = ca_coords.clone().float()
+    data['protein'].y = sequence_labels.long()
+    if "chain_M" in feature_dict:
+        data['protein'].chain_M = feature_dict["chain_M"].squeeze(0)[mask]
+    p_pos = data['protein'].pos
+    pp_edge_index = radius_graph(p_pos, r=radius_cutoff, loop=False)
+    p_row, p_col = pp_edge_index
+    pp_dist = torch.norm(p_pos[p_row] - p_pos[p_col], dim=1, p=2).unsqueeze(-1)
+    data['protein', 'interacts_with', 'protein'].edge_index = pp_edge_index
+    data['protein', 'interacts_with', 'protein'].edge_attr = pp_dist
+    # 2. Build Ligand Nodes
+    Y = feature_dict.get("ligand_Y")
+    Y_t = feature_dict.get("ligand_Y_t")
+    Y_m = feature_dict.get("ligand_Y_m")
+    num_ligand_atoms = 0
+    if Y is not None and Y_m is not None:
+        Y_mask = Y_m.bool()
+        if Y_mask.sum() > 0:
+            Y = Y[Y_mask]
+            Y_t = Y_t[Y_mask]
+            num_ligand_atoms = Y.shape[0]
+            lig_x = encode_ligand_elements(Y_t)
+            data['ligand'].x = lig_x
+            data['ligand'].pos = Y.float()
+    if num_ligand_atoms > 0:
+        l_pos = data['ligand'].pos
+        pl_edge_index = radius(l_pos, p_pos, r=radius_cutoff)
+        if pl_edge_index.size(1) > 0:
+            p_idx, l_idx = pl_edge_index[0], pl_edge_index[1]
+            lp_edge_index = torch.stack([l_idx, p_idx], dim=0)
+            lp_dist = torch.norm(l_pos[l_idx] - p_pos[p_idx], dim=1, p=2).unsqueeze(-1)
+            data['ligand', 'binds', 'protein'].edge_index = lp_edge_index
+            data['ligand', 'binds', 'protein'].edge_attr = lp_dist
+            pl_edge_index_rev = torch.stack([p_idx, l_idx], dim=0)
+            data['protein', 'binds', 'ligand'].edge_index = pl_edge_index_rev
+            data['protein', 'binds', 'ligand'].edge_attr = lp_dist.clone()
+        else:
+            data['ligand', 'binds', 'protein'].edge_index = torch.empty((2, 0), dtype=torch.long)
+            data['ligand', 'binds', 'protein'].edge_attr = torch.empty((0, 1), dtype=torch.float32)
+            data['protein', 'binds', 'ligand'].edge_index = torch.empty((2, 0), dtype=torch.long)
+            data['protein', 'binds', 'ligand'].edge_attr = torch.empty((0, 1), dtype=torch.float32)
+    else:
+        data['ligand'].x = torch.empty((0, 6), dtype=torch.float32)
+        data['ligand'].pos = torch.empty((0, 3), dtype=torch.float32)
+        data['ligand', 'binds', 'protein'].edge_index = torch.empty((2, 0), dtype=torch.long)
+        data['ligand', 'binds', 'protein'].edge_attr = torch.empty((0, 1), dtype=torch.float32)
+        data['protein', 'binds', 'ligand'].edge_index = torch.empty((2, 0), dtype=torch.long)
+        data['protein', 'binds', 'ligand'].edge_attr = torch.empty((0, 1), dtype=torch.float32)
+    return data
+def pdb_to_pyg_data(pdb_path, radius=8.0, device="cpu"):
+    feature_dict = get_ligandmpnn_features(pdb_path, device=device)
+    data = dict_to_pyg_data(feature_dict, radius_cutoff=radius)
+    return data
+# ---------------------------------------------------------
+# 3. INFERENCE ENDPOINT
+# ---------------------------------------------------------
+def predict_sequence(pdb_file):
+    if pdb_file is None:
+        return "Please upload a .pdb file."
+    try:
+        # Build the Heterogeneous Graph
+        data = pdb_to_pyg_data(pdb_file.name, device=device)
+        data = data.to(device)
+        num_residues = data['protein'].x.shape[0]
+        # Run the forward pass
         with torch.no_grad():
+            # Adjust the arguments here if your forward method signature differs!
+            logits = model(data.x_dict, data.edge_index_dict, data.edge_attr_dict)
+        # Decode logits to an amino acid string
         predicted_indices = torch.argmax(logits, dim=-1).cpu().numpy()
         predicted_seq = "".join([AA_ALPHABET[idx] for idx in predicted_indices])
         return f"Predicted Sequence ({num_residues} residues):\n\n{predicted_seq}"
     except Exception as e:
         return f"Error processing PDB: {str(e)}"
 # ---------------------------------------------------------
+# 4. GRADIO UI
 # ---------------------------------------------------------
 demo = gr.Interface(
     fn=predict_sequence,
     outputs=gr.Textbox(label="Designed Amino Acid Sequence", show_copy_button=True, lines=5),
     title="Struct2Seq-GNN: Inverse Protein Folding",
     description=(
+        "Upload a 3D target backbone to generate a sequence optimized by a custom Heterogeneous Graph Neural Network.\n\n"
         "**Model Performance:** Achieves ~30.3% global sequence recovery and **35.1% binding-pocket recovery** "
         "on noisy coordinates, confirming strong generalization to underlying biophysical folding constraints."
     ),
     allow_flagging="never",
+    theme=gr.themes.Soft()
 )
 if __name__ == "__main__":