Spaces (status: Sleeping) — commit "Update app.py" · Browse files
File changed: app.py (diff below)
|
@@ -45,7 +45,7 @@ import panel as pn
|
|
| 45 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 46 |
from tinydb import TinyDB, Query
|
| 47 |
|
| 48 |
-
import swifter
|
| 49 |
from tqdm.auto import tqdm
|
| 50 |
|
| 51 |
from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
|
|
@@ -837,7 +837,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
|
|
| 837 |
if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
|
| 838 |
x2 = prediction_df['X2'].iloc[0]
|
| 839 |
pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
|
| 840 |
-
pos_compounds_df['FP'] = pos_compounds_df['X1'].
|
| 841 |
|
| 842 |
@cache
|
| 843 |
def max_sim(smiles):
|
|
@@ -846,13 +846,13 @@ def apply_advanced_opts(prediction_df, opts, df_training):
|
|
| 846 |
prediction_df[[
|
| 847 |
'Max. Tanimoto Similarity to Known Ligands',
|
| 848 |
'Max. Sim. Ligand'
|
| 849 |
-
]] = prediction_df['X1'].
|
| 850 |
|
| 851 |
max_sim.cache_clear()
|
| 852 |
|
| 853 |
if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
|
| 854 |
x2 = prediction_df['X2'].iloc[0]
|
| 855 |
-
prediction_df['X1^'] = prediction_df['X1'].
|
| 856 |
|
| 857 |
@cache
|
| 858 |
def max_id(compound):
|
|
@@ -861,7 +861,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
|
|
| 861 |
|
| 862 |
prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
|
| 863 |
'Max. Id. Target']] = (
|
| 864 |
-
prediction_df['X1^'].
|
| 865 |
)
|
| 866 |
prediction_df.drop(['X1^'], axis=1, inplace=True)
|
| 867 |
|
|
@@ -870,7 +870,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
|
|
| 870 |
# Advanced options for Target Protein Identification
|
| 871 |
if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
|
| 872 |
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
| 873 |
-
df_training['FP'] = df_training['X1'].
|
| 874 |
|
| 875 |
prediction_df[[
|
| 876 |
'Max. Tanimoto Similarity to Training Compounds',
|
|
@@ -888,7 +888,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
|
|
| 888 |
prediction_df[[
|
| 889 |
'Max. Sequence Identity to Known Targets of Input Compound',
|
| 890 |
'Max. Id. Target'
|
| 891 |
-
]] = prediction_df['X2'].
|
| 892 |
|
| 893 |
max_id.cache_clear()
|
| 894 |
|
|
@@ -904,7 +904,7 @@ def apply_advanced_opts(prediction_df, opts, df_training):
|
|
| 904 |
prediction_df[[
|
| 905 |
'Max. Tanimoto Similarity to Known Ligands of Identified Target',
|
| 906 |
'Max. Sim. Ligand'
|
| 907 |
-
]] = prediction_df['X2'].
|
| 908 |
|
| 909 |
max_sim.cache_clear()
|
| 910 |
|
|
@@ -949,12 +949,12 @@ def submit_predict(predict_filepath, task, preset, target_family, opts, job_info
|
|
| 949 |
orig_df['Target Family'] = None
|
| 950 |
if orig_df['Target Family'].isna().any():
|
| 951 |
orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
|
| 952 |
-
orig_df.loc[orig_df['Target Family'].isna(), 'X2'].
|
| 953 |
)
|
| 954 |
orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
|
| 955 |
detect_family.cache_clear()
|
| 956 |
|
| 957 |
-
orig_df['X1^'] = orig_df['X1'].
|
| 958 |
|
| 959 |
orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
|
| 960 |
annotated_df = orig_df[~orig_df['Y'].isna()].copy()
|
|
@@ -1109,10 +1109,10 @@ def update_df(file, progress=gr.Progress(track_tqdm=True)):
|
|
| 1109 |
|
| 1110 |
if 'X1' in df.columns:
|
| 1111 |
if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
|
| 1112 |
-
df['Compound'] = df['X1'].
|
| 1113 |
lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
|
| 1114 |
-
df['Scaffold'] = df['Compound'].
|
| 1115 |
-
df['Scaffold SMILES'] = df['Scaffold'].
|
| 1116 |
df['Pharmacophore'] = None
|
| 1117 |
if task == 'Compound-Protein Binding Affinity':
|
| 1118 |
# Convert Y^ from pIC50 (nM) to IC50 (nM)
|
|
@@ -1182,17 +1182,17 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
| 1182 |
columns_unique = None
|
| 1183 |
|
| 1184 |
if 'Exclude Pharmacophore 3D' not in opts:
|
| 1185 |
-
df_html['Pharmacophore'] = df_html['Compound'].
|
| 1186 |
lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
|
| 1187 |
|
| 1188 |
if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
|
| 1189 |
-
df_html['Compound'] = df_html['Compound'].
|
| 1190 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
| 1191 |
else:
|
| 1192 |
df_html.drop(['Compound'], axis=1, inplace=True)
|
| 1193 |
|
| 1194 |
if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
|
| 1195 |
-
df_html['Scaffold'] = df_html['Scaffold'].
|
| 1196 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
| 1197 |
else:
|
| 1198 |
df_html.drop(['Scaffold'], axis=1, inplace=True)
|
|
@@ -1227,7 +1227,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
| 1227 |
df_html.rename(columns=column_aliases, inplace=True)
|
| 1228 |
df_html.index.name = 'Index'
|
| 1229 |
if 'Target FASTA' in df_html.columns:
|
| 1230 |
-
df_html['Target FASTA'] = df_html['Target FASTA'].
|
| 1231 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
| 1232 |
|
| 1233 |
num_cols = df_html.select_dtypes('number').columns
|
|
@@ -1247,7 +1247,7 @@ def create_html_report(df, file=None, task=None, opts=(), progress=gr.Progress(t
|
|
| 1247 |
if 'Target ID' in df_html.columns:
|
| 1248 |
df_html.drop(['Target FASTA'], axis=1, inplace=True)
|
| 1249 |
if 'Target FASTA' in df_html.columns:
|
| 1250 |
-
df_html['Target FASTA'] = df_html['Target FASTA'].
|
| 1251 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
| 1252 |
if 'Scaffold SMILES' in df_html.columns:
|
| 1253 |
df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
|
|
@@ -1555,11 +1555,11 @@ def submit_report(df, score_list, filter_list, opt_list, task, progress=gr.Progr
|
|
| 1555 |
df_report = df.copy()
|
| 1556 |
try:
|
| 1557 |
for filter_name in filter_list:
|
| 1558 |
-
df_report[filter_name] = df_report['Compound'].
|
| 1559 |
lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
|
| 1560 |
|
| 1561 |
for score_name in score_list:
|
| 1562 |
-
df_report[score_name] = df_report['Compound'].
|
| 1563 |
lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
|
| 1564 |
|
| 1565 |
if opt_list:
|
|
@@ -2263,7 +2263,7 @@ higher similarities usually correspond to greater prediction confidence.<br>
|
|
| 2263 |
alignment = aligner.align(processed_fasta, query)
|
| 2264 |
return alignment.score / max(len(processed_fasta), len(query))
|
| 2265 |
|
| 2266 |
-
alignment_df['score'] = alignment_df['X2'].
|
| 2267 |
row = alignment_df.loc[alignment_df['score'].idxmax()]
|
| 2268 |
family = str(row['Target Family']).title()
|
| 2269 |
return gr.Dropdown(value=family,
|
|
@@ -2595,13 +2595,13 @@ QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
|
|
| 2595 |
infer_df = pd.read_csv(drug_target_pair_upload)
|
| 2596 |
validate_columns(infer_df, ['X1', 'X2'])
|
| 2597 |
|
| 2598 |
-
infer_df['X1_ERR'] = infer_df['X1'].
|
| 2599 |
validate_seq_str, regex=SMILES_PAT)
|
| 2600 |
if not infer_df['X1_ERR'].isna().all():
|
| 2601 |
raise ValueError(
|
| 2602 |
f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
|
| 2603 |
|
| 2604 |
-
infer_df['X2_ERR'] = infer_df['X2'].
|
| 2605 |
validate_seq_str, regex=FASTA_PAT)
|
| 2606 |
if not infer_df['X2_ERR'].isna().all():
|
| 2607 |
raise ValueError(
|
|
|
|
| 45 |
from apscheduler.schedulers.background import BackgroundScheduler
|
| 46 |
from tinydb import TinyDB, Query
|
| 47 |
|
| 48 |
+
#import swifter
|
| 49 |
from tqdm.auto import tqdm
|
| 50 |
|
| 51 |
from deepscreen.data.dti import validate_seq_str, rdkit_canonicalize, FASTA_PAT, SMILES_PAT
|
|
|
|
| 837 |
if "Calculate Max. Tanimoto Similarity between the Hit Compound and Known Ligands of the Input Target" in opts:
|
| 838 |
x2 = prediction_df['X2'].iloc[0]
|
| 839 |
pos_compounds_df = df_training.loc[(df_training['X2'] == x2) & (df_training['Y'] == 1)].copy()
|
| 840 |
+
pos_compounds_df['FP'] = pos_compounds_df['X1'].parallel_apply(smiles_to_ecfp)
|
| 841 |
|
| 842 |
@cache
|
| 843 |
def max_sim(smiles):
|
|
|
|
| 846 |
prediction_df[[
|
| 847 |
'Max. Tanimoto Similarity to Known Ligands',
|
| 848 |
'Max. Sim. Ligand'
|
| 849 |
+
]] = prediction_df['X1'].parallel_apply(max_sim).apply(pd.Series)
|
| 850 |
|
| 851 |
max_sim.cache_clear()
|
| 852 |
|
| 853 |
if "Calculate Max. Sequence Identity between the Input Target and Known Targets of Hit Compound" in opts:
|
| 854 |
x2 = prediction_df['X2'].iloc[0]
|
| 855 |
+
prediction_df['X1^'] = prediction_df['X1'].parallel_apply(rdkit_canonicalize)
|
| 856 |
|
| 857 |
@cache
|
| 858 |
def max_id(compound):
|
|
|
|
| 861 |
|
| 862 |
prediction_df[['Max. Sequence Identity to Known Targets of Hit Compound',
|
| 863 |
'Max. Id. Target']] = (
|
| 864 |
+
prediction_df['X1^'].parallel_apply(max_id).apply(pd.Series)
|
| 865 |
)
|
| 866 |
prediction_df.drop(['X1^'], axis=1, inplace=True)
|
| 867 |
|
|
|
|
| 870 |
# Advanced options for Target Protein Identification
|
| 871 |
if "Calculate Max. Tanimoto Similarity between the Input Compound and Compounds in the Training Set" in opts:
|
| 872 |
x1 = rdkit_canonicalize(prediction_df['X1'].iloc[0])
|
| 873 |
+
df_training['FP'] = df_training['X1'].parallel_apply(smiles_to_ecfp)
|
| 874 |
|
| 875 |
prediction_df[[
|
| 876 |
'Max. Tanimoto Similarity to Training Compounds',
|
|
|
|
| 888 |
prediction_df[[
|
| 889 |
'Max. Sequence Identity to Known Targets of Input Compound',
|
| 890 |
'Max. Id. Target'
|
| 891 |
+
]] = prediction_df['X2'].parallel_apply(max_id).apply(pd.Series)
|
| 892 |
|
| 893 |
max_id.cache_clear()
|
| 894 |
|
|
|
|
| 904 |
prediction_df[[
|
| 905 |
'Max. Tanimoto Similarity to Known Ligands of Identified Target',
|
| 906 |
'Max. Sim. Ligand'
|
| 907 |
+
]] = prediction_df['X2'].parallel_apply(max_sim).apply(pd.Series)
|
| 908 |
|
| 909 |
max_sim.cache_clear()
|
| 910 |
|
|
|
|
| 949 |
orig_df['Target Family'] = None
|
| 950 |
if orig_df['Target Family'].isna().any():
|
| 951 |
orig_df.loc[orig_df['Target Family'].isna(), 'Target Family'] = (
|
| 952 |
+
orig_df.loc[orig_df['Target Family'].isna(), 'X2'].parallel_apply(detect_family)
|
| 953 |
)
|
| 954 |
orig_df['Target Family'] = orig_df['Target Family'].str.capitalize()
|
| 955 |
detect_family.cache_clear()
|
| 956 |
|
| 957 |
+
orig_df['X1^'] = orig_df['X1'].parallel_apply(rdkit_canonicalize)
|
| 958 |
|
| 959 |
orig_df = orig_df.merge(df_training[['X1^', 'X2', 'Y']], on=['X1^', 'X2'], how='left', indicator=False)
|
| 960 |
annotated_df = orig_df[~orig_df['Y'].isna()].copy()
|
|
|
|
| 1109 |
|
| 1110 |
if 'X1' in df.columns:
|
| 1111 |
if 'Compound' not in df.columns or df['Compound'].dtype != 'object':
|
| 1112 |
+
df['Compound'] = df['X1'].parallel_apply(
|
| 1113 |
lambda smiles: PandasTools._MolPlusFingerprint(Chem.MolFromSmiles(smiles)))
|
| 1114 |
+
df['Scaffold'] = df['Compound'].parallel_apply(MurckoScaffold.GetScaffoldForMol)
|
| 1115 |
+
df['Scaffold SMILES'] = df['Scaffold'].parallel_apply(lambda x: Chem.MolToSmiles(x))
|
| 1116 |
df['Pharmacophore'] = None
|
| 1117 |
if task == 'Compound-Protein Binding Affinity':
|
| 1118 |
# Convert Y^ from pIC50 (nM) to IC50 (nM)
|
|
|
|
| 1182 |
columns_unique = None
|
| 1183 |
|
| 1184 |
if 'Exclude Pharmacophore 3D' not in opts:
|
| 1185 |
+
df_html['Pharmacophore'] = df_html['Compound'].parallel_apply(
|
| 1186 |
lambda x: mol_to_pharm3d(x) if not pd.isna(x) else x)
|
| 1187 |
|
| 1188 |
if 'Compound' in df_html.columns and 'Exclude Molecular Graph' not in opts:
|
| 1189 |
+
df_html['Compound'] = df_html['Compound'].parallel_apply(
|
| 1190 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
| 1191 |
else:
|
| 1192 |
df_html.drop(['Compound'], axis=1, inplace=True)
|
| 1193 |
|
| 1194 |
if 'Scaffold' in df_html.columns and 'Exclude Scaffold Graph' not in opts:
|
| 1195 |
+
df_html['Scaffold'] = df_html['Scaffold'].parallel_apply(
|
| 1196 |
lambda x: PandasTools.PrintAsImageString(x) if not pd.isna(x) else x)
|
| 1197 |
else:
|
| 1198 |
df_html.drop(['Scaffold'], axis=1, inplace=True)
|
|
|
|
| 1227 |
df_html.rename(columns=column_aliases, inplace=True)
|
| 1228 |
df_html.index.name = 'Index'
|
| 1229 |
if 'Target FASTA' in df_html.columns:
|
| 1230 |
+
df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
|
| 1231 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
| 1232 |
|
| 1233 |
num_cols = df_html.select_dtypes('number').columns
|
|
|
|
| 1247 |
if 'Target ID' in df_html.columns:
|
| 1248 |
df_html.drop(['Target FASTA'], axis=1, inplace=True)
|
| 1249 |
if 'Target FASTA' in df_html.columns:
|
| 1250 |
+
df_html['Target FASTA'] = df_html['Target FASTA'].parallel_apply(
|
| 1251 |
lambda x: wrap_text(x) if not pd.isna(x) else x)
|
| 1252 |
if 'Scaffold SMILES' in df_html.columns:
|
| 1253 |
df_html.drop(['Scaffold SMILES'], axis=1, inplace=True)
|
|
|
|
| 1555 |
df_report = df.copy()
|
| 1556 |
try:
|
| 1557 |
for filter_name in filter_list:
|
| 1558 |
+
df_report[filter_name] = df_report['Compound'].parallel_apply(
|
| 1559 |
lambda x: FILTER_MAP[filter_name](x) if not pd.isna(x) else x)
|
| 1560 |
|
| 1561 |
for score_name in score_list:
|
| 1562 |
+
df_report[score_name] = df_report['Compound'].parallel_apply(
|
| 1563 |
lambda x: SCORE_MAP[score_name](x) if not pd.isna(x) else x)
|
| 1564 |
|
| 1565 |
if opt_list:
|
|
|
|
| 2263 |
alignment = aligner.align(processed_fasta, query)
|
| 2264 |
return alignment.score / max(len(processed_fasta), len(query))
|
| 2265 |
|
| 2266 |
+
alignment_df['score'] = alignment_df['X2'].parallel_apply(align_score)
|
| 2267 |
row = alignment_df.loc[alignment_df['score'].idxmax()]
|
| 2268 |
family = str(row['Target Family']).title()
|
| 2269 |
return gr.Dropdown(value=family,
|
|
|
|
| 2595 |
infer_df = pd.read_csv(drug_target_pair_upload)
|
| 2596 |
validate_columns(infer_df, ['X1', 'X2'])
|
| 2597 |
|
| 2598 |
+
infer_df['X1_ERR'] = infer_df['X1'].parallel_apply(
|
| 2599 |
validate_seq_str, regex=SMILES_PAT)
|
| 2600 |
if not infer_df['X1_ERR'].isna().all():
|
| 2601 |
raise ValueError(
|
| 2602 |
f"Encountered invalid SMILES:\n{infer_df[~infer_df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
|
| 2603 |
|
| 2604 |
+
infer_df['X2_ERR'] = infer_df['X2'].parallel_apply(
|
| 2605 |
validate_seq_str, regex=FASTA_PAT)
|
| 2606 |
if not infer_df['X2_ERR'].isna().all():
|
| 2607 |
raise ValueError(
|