File size: 5,701 Bytes
ea594ff
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d96e3d2
38fba51
d96e3d2
 
 
 
 
38fba51
d96e3d2
 
38fba51
ea594ff
 
 
 
d96e3d2
ea594ff
 
d96e3d2
 
 
 
 
 
 
 
38fba51
ea594ff
 
 
 
d96e3d2
ea594ff
 
38fba51
ea594ff
 
 
 
 
 
 
 
 
 
 
 
38fba51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d96e3d2
 
38fba51
ea594ff
 
 
 
38fba51
ea594ff
 
 
 
 
 
 
 
 
d96e3d2
38fba51
d96e3d2
38fba51
 
 
 
d96e3d2
 
 
 
38fba51
d96e3d2
f25aaec
 
 
d96e3d2
f25aaec
38fba51
f25aaec
 
38fba51
d96e3d2
 
 
 
 
 
38fba51
d96e3d2
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
import gradio as gr
import joblib
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel, EsmTokenizer, EsmModel
from rdkit import Chem

# Run the transformer models on the GPU when CUDA is available, else on CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pretrained models
# Everything below is loaded once at module execution and shared by all
# requests. The two Hugging Face checkpoints are moved to `device`.
esm = "facebook/esm2_t12_35M_UR50D" # generate protein embeddings
esm_tokenizer = EsmTokenizer.from_pretrained(esm)
esm_model = EsmModel.from_pretrained(esm).to(device)
chemberta = "DeepChem/ChemBERTa-10M-MTR" # generate ligand embeddings
chemberta_tokenizer = AutoTokenizer.from_pretrained(chemberta)
chemberta_model = AutoModel.from_pretrained(chemberta).to(device)
# Fitted artifacts applied in predict_affinity in this order: scaler -> pca -> svr.
# The paths are relative, so they resolve against the current working directory.
# NOTE(review): joblib.load unpickles its input; only load trusted files.
scaler = joblib.load("scaler.pkl")
pca = joblib.load("pca.pkl")
svr = joblib.load("svr_model.pkl")

def generate_protein_embedding(protein_input, input_type):
    """Embed a protein with the ESM model by mean-pooling the last hidden state.

    Args:
        protein_input: When ``input_type`` is ``"File"``, the path of a PDB
            file, or an object exposing that path as ``.name``. Otherwise the
            protein sequence as text; a full FASTA record (``>`` header lines,
            sequence wrapped over several lines) is accepted as well.
        input_type: ``"File"`` to read a PDB file; any other value treats
            ``protein_input`` as sequence text.

    Returns:
        The pooled embedding as a NumPy array with one row per input sequence
        (a single row here), or ``None`` when the PDB file cannot be parsed or
        no sequence could be extracted from the input.
    """
    if input_type == "File":
        # Upload widgets may hand over either a path string or a temp-file
        # wrapper; RDKit needs the path itself.
        pdb_path = getattr(protein_input, "name", protein_input)
        mol = Chem.MolFromPDBFile(pdb_path)
        if not mol:
            return None
        # Second line of RDKit's FASTA output (presumably the sequence after
        # the header line); guard against output that lacks it.
        fasta_lines = Chem.MolToFASTA(mol).splitlines()
        fasta = fasta_lines[1].strip() if len(fasta_lines) > 1 else ""
    else:
        # Drop FASTA header lines and join wrapped sequence lines so that only
        # residues reach the tokenizer. A bare one-line sequence is unchanged.
        fasta = "".join(
            line.strip()
            for line in (protein_input or "").splitlines()
            if not line.lstrip().startswith(">")
        )

    # An empty sequence would still yield an embedding (special tokens only)
    # and hence a meaningless prediction, so report it as a failure instead.
    if not fasta:
        return None

    # Generate protein embedding
    esm_model.eval()
    with torch.no_grad():
        inputs = esm_tokenizer(
            fasta,
            return_tensors="pt",
            padding=True,
            truncation=True,
            max_length=1024,
        ).to(device)
        outputs = esm_model(**inputs)
        # Average over the token dimension to get one vector per sequence.
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

def generate_ligand_embedding(ligand_input, input_type):
    """Embed a ligand with the ChemBERTa model by mean-pooling the last hidden state.

    Args:
        ligand_input: When ``input_type`` is ``"File"``, the path of a MOL2
            file, or an object exposing that path as ``.name``. Otherwise a
            SMILES string.
        input_type: ``"File"`` to read a MOL2 file; any other value treats
            ``ligand_input`` as a SMILES string.

    Returns:
        The pooled embedding as a NumPy array with one row per input molecule
        (a single row here), or ``None`` when the MOL2 file cannot be parsed
        or the SMILES string is empty.
    """
    if input_type == "File":
        # Upload widgets may hand over either a path string or a temp-file
        # wrapper; RDKit needs the path itself.
        mol2_path = getattr(ligand_input, "name", ligand_input)
        mol = Chem.MolFromMol2File(mol2_path)
        if not mol:
            return None
        smiles = Chem.MolToSmiles(mol)
    else:
        smiles = (ligand_input or "").strip()

    # An empty string would still be tokenized and embedded, giving a
    # meaningless prediction; report it as a failure instead.
    if not smiles:
        return None

    # Generate compounds embeddings from SMILES
    chemberta_model.eval()
    with torch.no_grad():
        inputs = chemberta_tokenizer(
            smiles,
            return_tensors="pt",
            padding=True,
            truncation=True,
        ).to(device)
        outputs = chemberta_model(**inputs)
        # Average over the token dimension to get one vector per molecule.
        embedding = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
    return embedding

# Turn the predicted logKa (= -logKd) into a Kd string with a unit prefix
def value_conversion(logKa):
    """Format the dissociation constant implied by ``logKa``.

    Kd is computed as ``10 ** (-logKa)`` in molar and rendered with four
    decimals in the largest of mM / µM / nM whose lower bound it reaches;
    anything below 1e-9 M is rendered in pM.
    """
    dissociation = 10 ** (logKa * -1)
    # (lower bound in molar, multiplier to the unit, unit label), largest first.
    scales = (
        (1e-3, 1e3, "mM"),
        (1e-6, 1e6, "µM"),
        (1e-9, 1e9, "nM"),
    )
    for lower_bound, factor, unit in scales:
        if dissociation >= lower_bound:
            return f"{dissociation * factor:.4f} {unit}"
    return f"{dissociation * 1e12:.4f} pM"

def _choose_input(selected_type, file_value, text_value, text_type):
    """Pick which input source to use; return ``(value, type)`` or ``(None, None)``.

    The source named by ``selected_type`` wins when it holds a value. If it is
    empty, the other source is used as a fallback (file before text). Text that
    is blank after stripping counts as missing.
    """
    text = text_value.strip() if isinstance(text_value, str) else ""
    available = {}
    if file_value is not None:
        available["File"] = file_value
    if text:
        available[text_type] = text

    if selected_type in available:
        return available[selected_type], selected_type
    for kind in ("File", text_type):
        if kind in available:
            return available[kind], kind
    return None, None


def predict_affinity(protein_file, protein_fasta, protein_type, ligand_file, ligand_smiles, ligand_type):
    """Predict the binding affinity of a protein-ligand pair and format the result.

    Args:
        protein_file: Uploaded protein PDB file, or ``None``.
        protein_fasta: Protein sequence text, or ``None``/blank.
        protein_type: Selected protein source, ``"File"`` or ``"FASTA"``.
        ligand_file: Uploaded ligand MOL2 file, or ``None``.
        ligand_smiles: Ligand SMILES text, or ``None``/blank.
        ligand_type: Selected ligand source, ``"File"`` or ``"SMILES"``.

    Returns:
        A multi-line string with the predicted logKa and the derived Kd, or an
        error message when an input is missing or cannot be parsed.
    """
    # Honour the selected source: a hidden widget can still hold a stale value
    # from before the user switched input type, and must not override it.
    protein_input, protein_type = _choose_input(protein_type, protein_file, protein_fasta, "FASTA")
    if protein_input is None:
        return "Error: No valid protein input provided."

    ligand_input, ligand_type = _choose_input(ligand_type, ligand_file, ligand_smiles, "SMILES")
    if ligand_input is None:
        return "Error: No valid ligand input provided."

    # Get embeddings; stop at the first failure so no work is wasted.
    protein = generate_protein_embedding(protein_input, protein_type)
    if protein is None:
        if protein_type == "File":
            return "Unable to parse protein .pdb file"
        return "Unable to parse protein FASTA sequence"

    ligand = generate_ligand_embedding(ligand_input, ligand_type)
    if ligand is None:
        if ligand_type == "File":
            return "Unable to parse ligand .mol2 file"
        return "Unable to parse ligand SMILES string"

    # Protein and ligand vectors are joined feature-wise into one row.
    embedding = np.concatenate((protein, ligand), axis=1)
    # Apply scaling and PCA
    svr_input = scaler.transform(embedding)
    svr_input = pca.transform(svr_input)
    # Predict the log binding affinity
    log_prediction = svr.predict(svr_input)[0]
    affinity_value = value_conversion(log_prediction)
    return f"Predicted Binding Affinity:\nlogKa = {log_prediction:.4f}\nKd = {affinity_value}"

def update_inputs(protein_type, ligand_type):
    """Return visibility/interactivity updates for the four input widgets.

    The updates come back in the order protein file, protein FASTA, ligand
    file, ligand SMILES. Each widget is shown and made interactive exactly
    when its input type is the one currently selected.
    """
    active_flags = (
        protein_type == "File",
        protein_type == "FASTA",
        ligand_type == "File",
        ligand_type == "SMILES",
    )
    return tuple(
        gr.update(visible=is_active, interactive=is_active)
        for is_active in active_flags
    )

# Build the Gradio UI. Components are created in display order, so the
# statement order inside this block determines the page layout.
with gr.Blocks() as iface:
    gr.Markdown("# Predict Protein-Ligand Binding Affinity")
    gr.Markdown("Upload protein and compound files or enter FASTA/SMILES strings to predict binding affinity.")

    # Input-type selectors; both default to "File".
    with gr.Row():
        protein_type = gr.Radio(["File", "FASTA"], label="Protein Input Type", value="File")
        ligand_type = gr.Radio(["File", "SMILES"], label="Ligand Input Type", value="File")

    # The text boxes start hidden to match the default "File" selection;
    # update_inputs toggles them when a radio changes.
    protein_file = gr.File(label="Protein .pdb file", visible=True, interactive=True)
    protein_fasta = gr.Textbox(label="Protein FASTA sequence", visible=False, interactive=True)  

    ligand_file = gr.File(label="Ligand .mol2 file", visible=True, interactive=True)
    ligand_smiles = gr.Textbox(label="Ligand SMILES string", visible=False, interactive=True)  

    output = gr.Textbox(label="Prediction Result", lines=3)

    # The inputs list follows predict_affinity's parameter order.
    submit_btn = gr.Button("Predict")
    submit_btn.click(
        predict_affinity,
        inputs=[protein_file, protein_fasta, protein_type, ligand_file, ligand_smiles, ligand_type],
        outputs=output
    )

    # Either radio changing re-evaluates all four input widgets, because
    # update_inputs takes both selections and returns four updates.
    protein_type.change(update_inputs, inputs=[protein_type, ligand_type], outputs=[protein_file, protein_fasta, ligand_file, ligand_smiles])
    ligand_type.change(update_inputs, inputs=[protein_type, ligand_type], outputs=[protein_file, protein_fasta, ligand_file, ligand_smiles])

# Starts the app whenever this module is executed (there is no __main__ guard).
iface.launch()