Spaces:

libokj
/

DeepSEQreen_NAR_fb

Sleeping

App Files Files Community

libokj committed on Dec 20, 2023

Commit

52318e2

1 Parent(s): 0065d8f

Upload 174 files

Browse files

Files changed (35) hide show

.gitattributes +2 -0
app.py +978 -210
data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv +0 -0
deepscreen/__init__.py +2 -2
deepscreen/__pycache__/__init__.cpython-311.pyc +0 -0
deepscreen/__pycache__/train.cpython-311.pyc +0 -0
deepscreen/data/__pycache__/dti.cpython-311.pyc +0 -0
deepscreen/data/dti.py +67 -23
deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc +0 -0
deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc +0 -0
deepscreen/data/featurizers/__pycache__/graph.cpython-311.pyc +0 -0
deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc +0 -0
deepscreen/data/featurizers/categorical.py +15 -15
deepscreen/data/featurizers/monn.py +1 -1
deepscreen/data/featurizers/token.py +18 -14
deepscreen/data/utils/__pycache__/collator.cpython-311.pyc +0 -0
deepscreen/data/utils/__pycache__/label.cpython-311.pyc +0 -0
deepscreen/data/utils/__pycache__/split.cpython-311.pyc +0 -0
deepscreen/data/utils/collator.py +94 -43
deepscreen/data/utils/label.py +1 -0
deepscreen/gui/test.py +114 -0
deepscreen/models/__pycache__/dti.cpython-311.pyc +0 -0
deepscreen/models/dti.py +1 -1
deepscreen/models/loss/__pycache__/multitask_loss.cpython-311.pyc +0 -0
deepscreen/models/metrics/bedroc.py +3 -0
deepscreen/models/metrics/ci.py +39 -0
deepscreen/models/metrics/ef.py +4 -1
deepscreen/models/metrics/hit_rate.py +3 -0
deepscreen/models/metrics/rie.py +9 -6
deepscreen/models/predictors/drug_vqa.py +4 -1
deepscreen/models/predictors/transformer_cpi.py +26 -66
deepscreen/models/predictors/transformer_cpi_2.py +2 -3
deepscreen/utils/__pycache__/hydra.cpython-311.pyc +0 -0
deepscreen/utils/hydra.py +46 -36
resources/vocabs/drug_vqa/combinedVoc-wholeFour.voc +0 -1

.gitattributes CHANGED Viewed

@@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+data/drug_libraries/drugbank_human_py_annot.csv filter=lfs diff=lfs merge=lfs -text
+resources/checkpoints/deep_dta-binary-general.ckpt.bak filter=lfs diff=lfs merge=lfs -text

app.py CHANGED Viewed

@@ -1,53 +1,207 @@
-import hydra
 import os
 import pathlib
 from pathlib import Path
 import sys
 import gradio as gr
 import pandas as pd
 from rdkit import Chem
-from rdkit.Chem import RDConfig, Descriptors, Lipinski, Crippen
 from deepscreen.predict import predict
 sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
 import sascorer
 ROOT = Path.cwd()
-# TODO refactor caching with LRU
-# MOL_MAP = {}
-# def cached_mol(smiles):
-#     if smiles not in MOL_MAP:
-#         MOL_MAP.update({smiles: Chem.MolFromSmiles(smiles)})
-#     return MOL_MAP.get(smiles)
 def sa_score(row):
-    return sascorer.calculateScore(Chem.MolFromSmiles(row['X1']))
 def mw(row):
-    return Chem.Descriptors.MolWt(Chem.MolFromSmiles(row['X1']))
 def hbd(row):
-    return Lipinski.NumHDonors(Chem.MolFromSmiles(row['X1']))
 def hba(row):
-    return Lipinski.NumHAcceptors(Chem.MolFromSmiles(row['X1']))
 def logp(row):
-    return Crippen.MolLogP(Chem.MolFromSmiles(row['X1']))
 SCORE_MAP = {
     'SAscore': sa_score,
-    'RAscore': None, # https://github.com/reymond-group/RAscore
-    'SCScore': None, # https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622
-    'LogP': logp, # https://www.rdkit.org/docs/source/rdkit.Chem.Crippen.html
-    'MW': mw, # https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors.html
-    'HBD': hbd, # https://www.rdkit.org/docs/source/rdkit.Chem.Lipinski.html
-    'HBA': hba, # https://www.rdkit.org/docs/source/rdkit.Chem.Lipinski.html
-    'TopoPSA': None, # http://mordred-descriptor.github.io/documentation/master/api/mordred.TopoPSA.html
 }
 FILTER_MAP = {
@@ -64,36 +218,36 @@ TASK_MAP = {
 PRESET_MAP = {
     'DeepDTA': 'deep_dta',
-    'GraphDTA': 'graph_dta'
 }
 TARGET_FAMILY_MAP = {
-    'Auto-detect': 'detect',
-    'Manually-labelled': 'labelled',
-    'Library-labelled': 'labelled',
-    'Kinases': 'kinases',
-    'Non-kinase enzymes': 'non-kinase_enzymes',
-    'Membrane receptors': 'membrane_receptors',
-    'Nuclear receptors': 'nuclear_receptors',
-    'Ion channels': 'ion_channels',
     'Other protein targets': 'other_protein_targets',
-    'Kinases (auto-detected)': 'kinases',
-    'Non-kinase enzymes (auto-detected)': 'non-kinase_enzymes',
-    'Membrane receptors (auto-detected)': 'membrane_receptors',
-    'Nuclear receptors (auto-detected)': 'nuclear_receptors',
-    'Ion channels (auto-detected)': 'ion_channels',
-    'Other protein targets (auto-detected)': 'other_protein_targets',
-    'Indiscriminate': 'indiscriminate'
 }
 TARGET_LIBRARY_MAP = {
-    'STITCH': 'stitch.csv',
-    'Drug Repurposing Hub': 'drug_repurposing_hub.csv',
 }
 DRUG_LIBRARY_MAP = {
-    'ChEMBL': 'chembl.csv',
-    'DrugBank': 'drug_bank.csv',
 }
 MODE_LIST = [
@@ -102,182 +256,796 @@ MODE_LIST = [
     'Drug-target pair'
 ]
-def predictions_to_df(predictions):
-    predictions = [pd.DataFrame(prediction) for prediction in predictions]
-    prediction_df = pd.concat(predictions, ignore_index=True)
-    return prediction_df
-def submit_predict(predict_data, task, preset, target_family):
-    task = TASK_MAP[task]
-    preset = PRESET_MAP[preset]
-    target_family = TARGET_FAMILY_MAP[target_family]
-    match target_family:
-        case 'labelled':
-            pass  # target_family_list = ...
-        case 'detect':
-            pass  # target_family_list = ...
-        case _:
-            target_family_list = [target_family]
-    prediction_df = pd.DataFrame()
-    for target_family in target_family_list:
         with hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference"):
             cfg = hydra.compose(
                 config_name="webserver_inference",
-                overrides=[
-                    f"task={task}",
-                    f"preset={preset}",
-                    f"ckpt_path=resources/checkpoints/{preset}-{task}-{target_family}.ckpt",
-                    f"data.data_file='{str(predict_data)}'",
-                ]
-            )
-        predictions, _ = predict(cfg)
-        prediction_df = pd.concat([prediction_df, predictions_to_df(predictions)])
-    return [gr.DataFrame(value=prediction_df, visible=True), gr.Tabs(selected=1)]
-# Define a function that takes a CSV output and a list of analytical utility functions as inputs
-def submit_report(df, score_list, filter_list):
-    # Loop through the list of functions and apply them to the dataframe
-    for filter_name in filter_list:
-        gr.Info(f'Applying {filter_name}...')
-    for score_name in score_list:
-        gr.Info(f'Calculating {score_name}...')
-        # Apply the function to the dataframe and assign the result to a new column
-        df[score_name] = df.apply(SCORE_MAP[score_name], axis=1)
-    # Return the dataframe as a table
-    return [gr.DataFrame(visible=False), gr.DataFrame(value=df, visible=True)]
-def change_layout(mode):
-    match mode:
-        case "Drug screening":
-            return [
-                gr.Row(visible=True),
-                gr.Row(visible=False),
-                gr.Row(visible=False),
-                gr.Dropdown(choices=[
-                    'Auto-detect',
-                    'Kinases',
-                    'Non-kinase enzymes',
-                    'Membrane receptors',
-                    'Nuclear receptors',
-                    'Ion channels',
-                    'Other protein targets',
-                    'Indiscriminate'
-                ])
-            ]
-        case "Drug repurposing":
-            return [
-                gr.Row(visible=False),
-                gr.Row(visible=True),
-                gr.Row(visible=False),
-                gr.Dropdown(choices=[
-                    'Library-labelled',
-                    'Indiscriminate'
-                ])
-            ]
-        case "Drug-target pair":
-            return [
-                gr.Row(visible=False),
-                gr.Row(visible=False),
-                gr.Row(visible=True),
-                gr.Dropdown(choices=[
-                    'Auto-detect',
-                    'Manually-labelled',
-                    'Indiscriminate'
-                ])
-            ]
-with gr.Blocks(theme=gr.themes.Soft(spacing_size="sm", text_size='md'), title='DeepScreen') as demo:
     with gr.Tabs() as tabs:
-        with gr.TabItem(label='Inference', id=0) as inference:
             gr.Markdown('''
-            # <center>DeepScreen Inference Service</center>
-            DeepScreen for predicting drug-target interaction/binding affinity.
-            ''')
-            mode = gr.Radio(label='Mode', choices=MODE_LIST, value='Drug screening')
-            with gr.Row(visible=True) as drug_screening:
-                with gr.Column():
-                    target = gr.Textbox(label='Target FASTA sequence')
-                    drug_library = gr.Dropdown(label='Drug library', choices=DRUG_LIBRARY_MAP.keys())
-                    # Modify the pd df directly with df['X2'] = target
-            with gr.Row(visible=False) as drug_repurposing:
-                with gr.Column():
-                    drug = gr.Textbox(label='Drug SMILES sequence')
-                    target_library = gr.Dropdown(label='Target library', choices=TARGET_LIBRARY_MAP.keys())
-                    # Modify the pd df directly with df['X1'] = drug
-            with gr.Row(visible=False) as drug_target_pair:
-                predict_data = gr.File(label='Prediction dataset file', file_count="single", type='filepath', height=50)
-            with gr.Row(visible=True):
-                task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
-                preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
-                target_family = gr.Dropdown(choices=[
-                    'Auto-detect',
-                    'Kinases',
-                    'Non-kinase enzymes',
-                    'Membrane receptors',
-                    'Nuclear receptors',
-                    'Ion channels',
-                    'Other protein targets',
-                    'Indiscriminate'
-                ], label='Target family')
-            with gr.Row(visible=True):
-                predict_btn = gr.Button("Predict", variant="primary")
-        with gr.TabItem(label='Report', id=1) as report:
             gr.Markdown('''
-                # <center>DeepScreen Virtual Screening Report</center>
-                Analytic report for virtual screening predictions.
-                ''')
-            with gr.Row():
-                scores = gr.CheckboxGroup(SCORE_MAP.keys(), label='Scores')
-                filters = gr.CheckboxGroup(FILTER_MAP.keys(), label='Filters')
-            with gr.Row():
-                df_original = gr.Dataframe(type="pandas", interactive=False, height=500, visible=False)
-                df_report = gr.Dataframe(type="pandas", interactive=False, height=500, visible=False)
-            with gr.Row():
-                clear_btn = gr.ClearButton()
-                analyze_btn = gr.Button("Report", variant="primary")
-    mode.change(change_layout, mode, [drug_screening, drug_repurposing, drug_target_pair, target_family], show_progress=False)
-    predict_btn.click(fn=submit_predict, inputs=[predict_data, task, preset, target_family], outputs=[df_original, tabs])
-    analyze_btn.click(fn=submit_report, inputs=[df_original, scores, filters], outputs=[df_original, df_report])
-    # js = """function () {
-    #   gradioURL = window.location.href
-    #   if (!gradioURL.endsWith('?__theme=light')) {
-    #     window.location.replace(gradioURL + '?__theme=light');
-    #   }
-    # }"""
-    js="""
-        () => {
-            document.body.classList.remove('dark');
-            document.querySelector('gradio-app').style.backgroundColor = 'var(--color-background-primary)'
-        }
-        """
-    demo.load(None, None, None, js=js)
-demo.close()
-demo.launch(debug=True)

+import hashlib
+import json
+import textwrap
+import threading
+from math import pi
+from uuid import uuid4
+import io
 import os
 import pathlib
 from pathlib import Path
 import sys
+from Bio import AlignIO, SeqIO
+# from email_validator import validate_email
 import gradio as gr
+import hydra
 import pandas as pd
+import plotly.express as px
+import requests
+from requests.adapters import HTTPAdapter, Retry
 from rdkit import Chem
+from rdkit.Chem import RDConfig, Descriptors, Draw, Lipinski, Crippen, PandasTools
+from rdkit.Chem.Scaffolds import MurckoScaffold
+import seaborn as sns
+import swifter
+from tqdm.auto import tqdm
+from deepscreen.data.dti import rdkit_canonicalize, validate_seq_str, FASTA_PAT, SMILES_PAT
 from deepscreen.predict import predict
 sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
 import sascorer
 ROOT = Path.cwd()
+DATA_PATH = Path("./")  # Path("/data")
+DF_FOR_REPORT = pd.DataFrame()
+pd.set_option('display.float_format', '{:.3f}'.format)
+PandasTools.molRepresentation = 'svg'
+PandasTools.drawOptions = Draw.rdMolDraw2D.MolDrawOptions()
+PandasTools.drawOptions.clearBackground = False
+PandasTools.drawOptions.bondLineWidth = 1.5
+PandasTools.drawOptions.explicitMethyl = True
+PandasTools.drawOptions.singleColourWedgeBonds = True
+PandasTools.drawOptions.useCDKAtomPalette()
+PandasTools.molSize = (128, 128)
+SESSION = requests.Session()
+ADAPTER = HTTPAdapter(max_retries=Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504]))
+SESSION.mount('http://', ADAPTER)
+SESSION.mount('https://', ADAPTER)
+# SCHEDULER = BackgroundScheduler()
+UNIPROT_ENDPOINT = 'https://rest.uniprot.org/uniprotkb/{query}'
+CSS = """
+.help-tip {
+  position: absolute;
+  display: block;
+  top: 0px;
+  right: 0px;
+  text-align: center;
+  background-color: #29b6f6;
+  border-radius: 50%;
+  width: 24px;
+  height: 24px;
+  font-size: 12px;
+  line-height: 26px;
+  cursor: default;
+  transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
+}
+.help-tip:hover {
+  cursor: pointer;
+  background-color: #ccc;
+}
+.help-tip:before {
+  content: '?';
+  font-weight: 700;
+  color: #fff;
+  z-index: 100;
+}
+.help-tip p {
+  visibility: hidden;
+  opacity: 0;
+  text-align: left;
+  background-color: #039be5;
+  padding: 20px;
+  width: 300px;
+  position: absolute;
+  border-radius: 4px;
+  right: -4px;
+  color: #fff;
+  font-size: 13px;
+  line-height: normal;
+  transform: scale(0.7);
+  transform-origin: 100% 0%;
+  transition: all 0.5s cubic-bezier(0.55, 0, 0.1, 1);
+  z-index: 100;
+}
+.help-tip:hover p {
+  cursor: default;
+  visibility: visible;
+  opacity: 1;
+  transform: scale(1.0);
+}
+.help-tip p:before {
+  position: absolute;
+  content: '';
+  width: 0;
+  height: 0;
+  border: 6px solid transparent;
+  border-bottom-color: #039be5;
+  right: 10px;
+  top: -12px;
+}
+.help-tip p:after {
+  width: 100%;
+  height: 40px;
+  content: '';
+  position: absolute;
+  top: -5px;
+  left: 0;
+}
+.help-tip a {
+  color: #fff;
+  font-weight: 700;
+}
+.help-tip a:hover, .help-tip a:focus {
+  color: #fff;
+  text-decoration: underline;
+}
+.upload_button {
+  background-color: #008000;
+}
+.absolute {
+  position: absolute;
+}
+#example {
+padding: 0;
+background: none;
+border: none;
+text-decoration: underline;
+box-shadow: none;
+text-align: left !important;
+display: inline-block !important;
+}
+footer {
+visibility: hidden
+}
+"""
class HelpTip:
    """Factory for a hover tooltip rendered as a ``gr.HTML`` component.

    ``__new__`` returns the ``gr.HTML`` object itself, so ``HelpTip(text)``
    never produces a ``HelpTip`` instance. The component carries the
    ``help-tip`` CSS class and wraps ``text`` in a ``<p>`` element.
    """

    def __new__(cls, text):
        tooltip_markup = f'<p>{text}</p>'
        return gr.HTML(value=tooltip_markup, elem_classes="help-tip")
 def sa_score(row):
     """Return ``sascorer.calculateScore`` for the object stored in ``row['Compound']``.

     ``row['Compound']`` is expected to be an RDKit molecule (``update_df``
     fills that column via ``PandasTools.AddMoleculeColumnToFrame``).
     """
     compound = row['Compound']
     return sascorer.calculateScore(compound)
 def mw(row):
     """Return ``Descriptors.MolWt`` for the molecule stored in ``row['Compound']``."""
     compound = row['Compound']
     return Chem.Descriptors.MolWt(compound)
 def hbd(row):
     """Return ``Lipinski.NumHDonors`` for the molecule stored in ``row['Compound']``."""
     compound = row['Compound']
     return Lipinski.NumHDonors(compound)
 def hba(row):
     """Return ``Lipinski.NumHAcceptors`` for the molecule stored in ``row['Compound']``."""
     compound = row['Compound']
     return Lipinski.NumHAcceptors(compound)
 def logp(row):
     """Return ``Crippen.MolLogP`` for the molecule stored in ``row['Compound']``."""
     compound = row['Compound']
     return Crippen.MolLogP(compound)
 SCORE_MAP = {
     'SAscore': sa_score,
+    'RAscore': None,  # https://github.com/reymond-group/RAscore
+    'SCScore': None,  # https://pubs.acs.org/doi/10.1021/acs.jcim.7b00622
+    'LogP': logp,  # https://www.rdkit.org/docs/source/rdkit.Chem.Crippen.html
+    'MW': mw,  # https://www.rdkit.org/docs/source/rdkit.Chem.Descriptors.html
+    'HBD': hbd,  # https://www.rdkit.org/docs/source/rdkit.Chem.Lipinski.html
+    'HBA': hba,  # https://www.rdkit.org/docs/source/rdkit.Chem.Lipinski.html
+    'TopoPSA': None,  # http://mordred-descriptor.github.io/documentation/master/api/mordred.TopoPSA.html
 }
 FILTER_MAP = {
 PRESET_MAP = {
     # Display name of each model preset -> identifier interpolated into the
     # Hydra ``preset=`` override and the checkpoint file name in submit_predict.
     'DeepDTA': 'deep_dta',
     'DeepConvDTI': 'deep_conv_dti',
     'GraphDTA': 'graph_dta',
     'MGraphDTA': 'm_graph_dta',
     'HyperAttentionDTI': 'hyper_attention_dti',
     'MolTrans': 'mol_trans',
     # Was misspelled 'transfomer_cpi'; aligned with 'transformer_cpi_2' below.
     # NOTE(review): confirm the config/checkpoint files use this spelling.
     'TransformerCPI': 'transformer_cpi',
     'TransformerCPI2': 'transformer_cpi_2',
     'DrugBAN': 'drug_ban',
     'DrugVQA(Seq)': 'drug_vqa',
 }
 TARGET_FAMILY_MAP = {
+    'General': 'general',
+    'Kinase': 'kinases',
+    'Non-kinase enzyme': 'non-kinase_enzymes',
+    'Membrane receptor': 'membrane_receptors',
+    'Nuclear receptor': 'nuclear_receptors',
+    'Ion channel': 'ion_channels',
     'Other protein targets': 'other_protein_targets',
 }
 TARGET_LIBRARY_MAP = {
+    # 'STITCH': 'stitch.csv',
+    'ChEMBL33 (all species)': 'ChEMBL33_all_spe_single_prot_info.csv',
+    'DrugBank (Human)': 'drugbank_human_py_annot.csv',
 }
 DRUG_LIBRARY_MAP = {
+    # 'ChEMBL': 'chembl.csv',
+    'DrugBank (Human)': 'drugbank_human_py_annot.csv',
 }
 MODE_LIST = [
     'Drug-target pair'
 ]
+COLUMN_ALIASES = {
+    'X1': 'Drug SMILES',
+    'X2': 'Target FASTA',
+    'ID1': 'Drug ID',
+    'ID2': 'Target ID',
+}
+URL = "https://ciddr-lab.ac.cn/deepseqreen"
def validate_columns(df, mandatory_cols):
    """Ensure that ``df`` contains every column named in ``mandatory_cols``.

    Args:
        df: DataFrame whose ``columns`` are inspected.
        mandatory_cols: Iterable of column names that must be present.

    Raises:
        gr.Error: If at least one mandatory column is absent. The message
            lists exactly the columns that are missing.
    """
    missing_cols = [col for col in mandatory_cols if col not in df.columns]
    if missing_cols:
        # Report the columns that are actually missing rather than a fixed list.
        error_message = (f"The following mandatory columns are missing "
                         f"in the uploaded dataset: {str(missing_cols).strip('[]')}.")
        raise gr.Error(error_message)
def send_email(receiver, msg):
    """Placeholder for e-mail notification; does nothing and returns ``None``."""
    return None
def submit_predict(predict_filepath, task, preset, target_family, flag, progress=gr.Progress(track_tqdm=True)):
    """Run inference on ``predict_filepath`` and write the predictions to a CSV file.

    Nothing happens (and ``None`` is returned) when ``flag`` is falsy. A truthy
    ``flag`` doubles as the job id used to name the output file.

    Args:
        predict_filepath: Path of the dataset passed to Hydra as ``data.data_file``.
        task: Key into ``TASK_MAP``.
        preset: Key into ``PRESET_MAP``.
        target_family: Key into ``TARGET_FAMILY_MAP``.
        flag: Falsy to skip the run; otherwise the job id.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        ``None`` when skipped, otherwise a list of
        ``[gr.Markdown(visible=True), gr.File(<csv path>), gr.State(False)]``.

    Side effects:
        Rebinds the module-level ``COLUMN_ALIASES`` with task-specific labels
        for ``'Y'`` and ``'Y^'`` and writes ``{job_id}_predictions.csv``.
    """
    global COLUMN_ALIASES

    if not flag:
        return None

    job_id = flag
    task_key = TASK_MAP[task]
    preset_key = PRESET_MAP[preset]
    family_key = TARGET_FAMILY_MAP[target_family]

    is_binary = task_key == 'binary'
    label_aliases = {
        'Y': 'Actual interaction' if is_binary else 'Actual affinity',
        'Y^': 'Predicted interaction' if is_binary else 'Predicted affinity'
    }
    COLUMN_ALIASES = COLUMN_ALIASES | label_aliases

    # NOTE: wrapping the run in try/except -> gr.Error is currently disabled.
    prediction_df = pd.DataFrame()
    with hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference"):
        overrides = [
            f"task={task_key}",
            f"preset={preset_key}",
            f"ckpt_path=resources/checkpoints/{preset_key}-{task_key}-{family_key}.ckpt",
            f"data.data_file='{str(predict_filepath)}'",
        ]
        cfg = hydra.compose(config_name="webserver_inference", overrides=overrides)

        predictions, _ = predict(cfg)
        batch_frames = [pd.DataFrame(prediction) for prediction in predictions]
        prediction_df = pd.concat([prediction_df, pd.concat(batch_frames, ignore_index=True)])

        predictions_file = f'{job_id}_predictions.csv'
        prediction_df.to_csv(predictions_file)

        return [gr.Markdown(visible=True),
                gr.File(predictions_file),
                gr.State(False)]
+    # email_lock = Path(f"outputs/{email_hash}.lock")
+    # with open(email_lock, "w") as file:
+    #     record = {
+    #         "email": email,
+    #         "job_id": job_id
+    #     }
+    #     json.dump(record, file)
+    # def run_predict():
+    # TODO per-user submit usage
+    #     # email_lock = Path(f"outputs/{email_hash}.lock")
+    #     # with open(email_lock, "w") as file:
+    #     #     record = {
+    #     #         "email": email,
+    #     #         "job_id": job_id
+    #     #     }
+    #     #     json.dump(record, file)
+    #
+    #     job_lock = DATA_PATH / f"outputs/{job_id}.lock"
+    #     with open(job_lock, "w") as file:
+    #         pass
+    #
+    #     try:
+    #         prediction_df = pd.DataFrame()
+    #         for family in target_family_list:
+    #             with hydra.initialize(version_base="1.3", config_path="configs", job_name="webserver_inference"):
+    #                 cfg = hydra.compose(
+    #                     config_name="webserver_inference",
+    #                     overrides=[f"task={task}",
+    #                                f"preset={preset}",
+    #                                f"ckpt_path=resources/checkpoints/{preset}-{task}-{family}.ckpt",
+    #                                f"data.data_file='{str(predict_dataset)}'"])
+    #
+    #             predictions, _ = predict(cfg)
+    #             predictions = [pd.DataFrame(prediction) for prediction in predictions]
+    #             prediction_df = pd.concat([prediction_df, pd.concat(predictions, ignore_index=True)])
+    #         prediction_df.to_csv(f'outputs/{job_id}.csv')
+    #         # email_lock.unlink()
+    #         job_lock.unlink()
+    #
+    #         msg = (f'Your DeepSEQcreen prediction job (id: {job_id}) completed successfully. You may retrieve the '
+    #                f'results and generate an analytical report at {URL} using the job id within 48 hours.')
+    #         gr.Info(msg)
+    #     except Exception as e:
+    #         msg = (f'Your DeepSEQcreen prediction job (id: {job_id}) failed due to an error: "{str(e)}." You may '
+    #                f'reach out to the author about the error through email (DeepSEQreen@xjtlu.edu.cn).')
+    #         raise gr.Error(str(e))
+    #     finally:
+    #         send_email(email, msg)
+    #
+    # # Run "predict" asynchronously
+    # threading.Thread(target=run_predict).start()
+    #
+    # msg = (f'Your DeepSEQcreen prediction job (id: {job_id}) started running. You may retrieve the results '
+    #        f'and generate an analytical report at {URL} using the job id once the job is done. Only one job '
+    #        f'per user is allowed at the same time.')
+    # send_email(email, msg)
+    # # Return the job id first
+    # return [
+    #     gr.Blocks(visible=False),
+    #     gr.Markdown(f"Your prediction job is running... "
+    #                 f"You may stay on this page or come back later to retrieve the results "
+    #                 f"Once you receive our email notification."),
+    # ]
def update_df(file, progress=gr.Progress(track_tqdm=True)):
    """Load a prediction CSV into ``DF_FOR_REPORT`` and build its HTML table and pie chart.

    Args:
        file: Path (or file-like object) readable by ``pd.read_csv``, or ``None``.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        ``(html, pie_chart)``. With ``file is None`` this is
        ``(gr.HTML(''), gr.Plot())``. Otherwise ``html`` comes from
        ``create_html_report`` and ``pie_chart`` is a figure or ``None``.

    Side effects:
        Rebinds the module-level ``DF_FOR_REPORT`` to a copy of the loaded frame.
    """
    global DF_FOR_REPORT

    if file is None:
        return gr.HTML(''), gr.Plot()

    df = pd.read_csv(file)
    if df['X1'].nunique() > 1:
        # More than one distinct compound: derive scaffolds and molecule columns.
        scaffold_runner = df['X1'].swifter.progress_bar(desc=f"Calculating scaffold...")
        df['Scaffold SMILES'] = scaffold_runner.apply(MurckoScaffold.MurckoScaffoldSmilesFromSmiles)
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol='X1', molCol='Compound',
                                             includeFingerprints=False)
        PandasTools.AddMoleculeColumnToFrame(df, smilesCol='Scaffold SMILES', molCol='Scaffold',
                                             includeFingerprints=False)
    DF_FOR_REPORT = df.copy()

    # Prefer predicted values ('Y^') over actual ones ('Y') for ranking.
    value = next((col for col in ('Y^', 'Y') if col in DF_FOR_REPORT.columns), None)

    pie_chart = None
    if value:
        n_compounds = DF_FOR_REPORT['X1'].nunique()
        n_targets = DF_FOR_REPORT['X2'].nunique()
        if n_compounds > 1 and n_targets <= 1:
            pie_chart = create_pie_chart(DF_FOR_REPORT, category='Scaffold SMILES', value=value, top_k=100)
        elif n_targets > 1 and n_compounds <= 1:
            pie_chart = create_pie_chart(DF_FOR_REPORT, category='Target family', value=value, top_k=100)

    return create_html_report(DF_FOR_REPORT), pie_chart
def create_html_report(df, progress=gr.Progress(track_tqdm=True)):
    """Render ``df`` as a scrollable HTML table with human-readable column names.

    Columns are reordered: identifier/label/molecule columns first, any other
    columns in the middle, raw ``X1``/``X2`` last. Only columns that exist in
    ``df`` are used. ``X2`` values are line-wrapped with ``wrap_text`` and
    column names are translated through ``COLUMN_ALIASES``.

    Args:
        df: DataFrame to render; it is not modified.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        HTML string: the table wrapped in a fixed-height, scrollable ``<div>``.
    """
    cols_left = [col for col in ('ID2', 'Y', 'Y^', 'ID1', 'Compound', 'Scaffold', 'Scaffold SMILES')
                 if col in df.columns]
    cols_right = [col for col in ('X1', 'X2') if col in df.columns]
    cols_middle = df.columns.drop(cols_left + cols_right).tolist()

    # Work on an explicit copy so the assignments below never touch the caller's frame.
    df = df[cols_left + cols_middle + cols_right].copy()

    # 'X2' is optional like every other column handled above.
    if 'X2' in df.columns:
        df['X2'] = df['X2'].apply(wrap_text)

    # ``rename(mapping)`` without ``columns=`` targets the index, so the aliases
    # were never applied; rename the columns explicitly.
    df = df.rename(columns=COLUMN_ALIASES)

    # NOTE(review): the former per-column gradient Styler was built but never
    # rendered (the plain ``df.to_html()`` below is what gets returned), so it
    # has been dropped. Presumably the plain frame is used so RDKit's patched
    # ``to_html`` can draw molecule images - confirm before reinstating styling.
    PandasTools.RenderImagesInAllDataFrames(images=True)
    html = df.to_html()
    return f'<div style="overflow:auto; height: 500px;">{html}</div>'
+# def create_pie_chart(df, category, value, top_k):
+#     df.rename(COLUMN_ALIASES, inplace=True)
+#     # Select the top_k records based on the value_col
+#     top_k_df = df.nlargest(top_k, value)
+#
+#     # Count the frequency of each unique value in the category_col column
+#     category_counts = top_k_df[category].value_counts()
+#
+#     # Convert the counts to a DataFrame
+#     data = pd.DataFrame({category: category_counts.index, 'value': category_counts.values})
+#
+#     # Calculate the angle for each category
+#     data['angle'] = data['value']/data['value'].sum() * 2*pi
+#
+#     # Assign colors
+#     data['color'] = Spectral11[0:len(category_counts)]
+#
+#     # Create the plot
+#     p = figure(height=350, title="Pie Chart", toolbar_location=None,
+#                tools="hover", tooltips="@{}: @value".format(category), x_range=(-0.5, 1.0))
+#
+#     p.wedge(x=0, y=1, radius=0.4,
+#             start_angle=cumsum('angle', include_zero=True), end_angle=cumsum('angle'),
+#             line_color="white", fill_color='color', legend_field=category, source=data)
+#
+#     p.axis.axis_label = None
+#     p.axis.visible = False
+#     p.grid.grid_line_color = None
+#
+#     return p
def create_pie_chart(df, category, value, top_k):
    """Plot how often each ``category`` occurs among the ``top_k`` rows ranked by ``value``.

    Args:
        df: Source DataFrame; it is not modified.
        category: Column whose distinct values become the pie slices.
        value: Column used to rank rows (largest first).
        top_k: Number of top-ranked rows to keep.

    Returns:
        A Plotly pie figure with percentage and label shown inside the slices.
    """
    # ``rename(mapping)`` without ``columns=`` targets the index. Previously the
    # columns kept their raw names while ``value`` was aliased, so ``nlargest``
    # raised KeyError as soon as COLUMN_ALIASES contained 'Y'/'Y^'.
    df = df.rename(columns=COLUMN_ALIASES)
    value = COLUMN_ALIASES.get(value, value)
    category = COLUMN_ALIASES.get(category, category)

    # Select the top_k records based on the value column
    top_k_df = df.nlargest(top_k, value)
    # Count the frequency of each unique value in the category column
    category_counts = top_k_df[category].value_counts()
    data = pd.DataFrame({category: category_counts.index, 'value': category_counts.values})

    fig = px.pie(data, values='value', names=category, title=f'Top-{top_k} {category} in {value}')
    fig.update_traces(textposition='inside', textinfo='percent+label')
    return fig
def submit_report(score_list, filter_list, progress=gr.Progress(track_tqdm=True)):
    """Add the requested score columns to the loaded data and rebuild the report.

    Args:
        score_list: Names of scores to compute; each must be a key of
            ``SCORE_MAP`` with an implemented (non-``None``) scorer.
        filter_list: Names of filters. Filters are not implemented, so this
            argument is currently ignored.
        progress: Gradio progress tracker (tracks tqdm bars).

    Returns:
        ``(html, pie_chart)``: the HTML table from ``create_html_report`` and a
        pie chart figure, or ``None`` when no chart applies.

    Raises:
        gr.Error: If a score is unavailable or any step of the report fails.
    """
    df = DF_FOR_REPORT.copy()
    try:
        for score_name in score_list:
            scorer = SCORE_MAP.get(score_name)
            if scorer is None:
                # Several SCORE_MAP entries are placeholders; fail with a clear
                # message instead of passing ``None`` to ``apply``.
                raise ValueError(f'The score "{score_name}" is not available yet.')
            df[score_name] = df.swifter.progress_bar(desc=f"Calculating {score_name}").apply(
                scorer, axis=1)

        # Prefer predicted values ('Y^') over actual ones ('Y') for ranking.
        value = None
        if 'Y^' in df.columns:
            value = 'Y^'
        elif 'Y' in df.columns:
            value = 'Y'

        pie_chart = None
        if value:
            if df['X1'].nunique() > 1 >= df['X2'].nunique():
                pie_chart = create_pie_chart(df, category='Scaffold SMILES', value=value, top_k=100)
            elif df['X2'].nunique() > 1 >= df['X1'].nunique():
                # Was misspelled 'Target famiy', which can never match a column.
                pie_chart = create_pie_chart(df, category='Target family', value=value, top_k=100)

        return create_html_report(df), pie_chart
    except Exception as e:
        raise gr.Error(str(e)) from e
def check_job_status(job_id):
    """Report whether the job ``job_id`` is running or finished.

    A job counts as running while ``DATA_PATH/{job_id}.lock`` exists and as
    finished once ``DATA_PATH/{job_id}.csv`` exists.

    Args:
        job_id: Identifier used to build the lock and result file names.

    Returns:
        A three-item list ``[markdown, tabs, file]``. While running, only the
        markdown message is set. When finished, the list also carries
        ``gr.Tabs(selected=3)`` and a ``gr.File`` pointing at the result CSV.

    Raises:
        gr.Error: If neither a lock file nor a result file exists for the job.
    """
    job_lock = DATA_PATH / f"{job_id}.lock"
    job_file = DATA_PATH / f"{job_id}.csv"

    # Lists, not set literals: a set collapses the two ``None`` placeholders
    # and has no defined order, so outputs could not be matched positionally.
    if job_lock.is_file():
        return [gr.Markdown(f"Your job ({job_id}) is still running... "
                            f"You may stay on this page or come back later to retrieve the results "
                            f"Once you receive our email notification."),
                None,
                None]
    if job_file.is_file():
        # Hand back the result CSV; the lock file does not exist in this branch.
        return [gr.Markdown(f"Your job ({job_id}) is done! Redirecting you to generate reports..."),
                gr.Tabs(selected=3),
                gr.File(str(job_file))]
    raise gr.Error(f"No running or finished job was found for the job id ({job_id}).")
def wrap_text(text, line_length=60):
    """Hard-wrap ``text`` to ``line_length`` characters per line.

    FASTA input (text starting with ``'>'``) is handled record by record: the
    header line is kept as is and the sequence lines are joined and re-wrapped.
    Any other string is wrapped as a whole.

    Args:
        text: String to wrap. Non-string values (e.g. ``None`` or the NaN that
            pandas uses for empty cells) are returned unchanged.
        line_length: Maximum line width.

    Returns:
        The wrapped string, or ``text`` itself when it is not a string.
    """
    # Applied over DataFrame columns, where missing cells are not strings.
    if not isinstance(text, str):
        return text

    wrapper = textwrap.TextWrapper(width=line_length)
    if not text.startswith('>'):
        return wrapper.fill(text)

    wrapped_sections = []
    for section in text.split('>'):
        if not section:
            continue
        seq_header, *seq_lines = section.split('\n')
        wrapped_seq = wrapper.fill(''.join(seq_lines))
        wrapped_sections.append(f">{seq_header}\n{wrapped_seq}")
    return '\n'.join(wrapped_sections)
def unwrap_text(text):
    """Strip surrounding whitespace from ``text`` and remove all newlines.

    Args:
        text: String that may span several lines.

    Returns:
        ``text`` as a single line.
    """
    # Previously ``text.strip.replece(...)``: ``strip`` was never called and
    # ``replace`` was misspelled, so every call raised AttributeError.
    return text.strip().replace('\n', '')
def smiles_from_sdf(sdf_path):
    """Return the SMILES string of the first record in the SDF file at ``sdf_path``."""
    with Chem.SDMolSupplier(sdf_path) as supplier:
        first_record = supplier[0]
        return Chem.MolToSmiles(first_record)
+theme = gr.themes.Base(spacing_size="sm", text_size='md').set(
+    background_fill_primary='#dfe6f0',
+    background_fill_secondary='#dfe6f0',
+    checkbox_label_background_fill='#dfe6f0',
+    checkbox_label_background_fill_hover='#dfe6f0',
+    checkbox_background_color='white',
+    checkbox_border_color='#4372c4',
+    border_color_primary='#4372c4',
+    border_color_accent='#4372c4',
+    button_primary_background_fill='#4372c4',
+    button_primary_text_color='white',
+    button_secondary_border_color='#4372c4',
+    body_text_color='#4372c4',
+    block_title_text_color='#4372c4',
+    block_label_text_color='#4372c4',
+    block_info_text_color='#505358',
+    block_border_color=None,
+    input_border_color='#4372c4',
+    panel_border_color='#4372c4',
+    input_background_fill='white',
+    code_background_fill='white',
+)
+with (gr.Blocks(theme=theme, title='DeepScreen', css=CSS) as demo):
+    run_state = gr.State(value=False)
+    screen_flag = gr.State(value=False)
+    identify_flag = gr.State(value=False)
+    infer_flag = gr.State(value=False)
     with gr.Tabs() as tabs:
+        with gr.TabItem(label='Drug hit screening', id=0):
             gr.Markdown('''
+                    # <center>DeepSEQreen Drug Hit Screening</center>
+                    <center>
+                    To predict interactions/binding affinities of a single target against a library of drugs.
+                    </center>
+                    ''')
+            with gr.Blocks() as screen_block:
+                with gr.Column() as screen_page:
+                    with gr.Row():
+                        with gr.Column(scale=4, variant='panel'):
+                            target_fasta = gr.Code(label='Target sequence FASTA',
+                                                   interactive=True, lines=5)
+                            example_target = gr.Button(value='Example: Human MAPK14', elem_id='example')
+                            with gr.Row():
+                                with gr.Column(scale=1):
+                                    with gr.Group():
+                                        with gr.Row():
+                                            target_input_type = gr.Radio(label='Target input type',
+                                                                         choices=['Sequence', 'UniProt ID', 'Gene symbol'],
+                                                                         value='Sequence')
+                                            target_query = gr.Textbox(label='UniProt ID/Accession',
+                                                                      visible=False, interactive=True)
+                                        target_upload_btn = gr.UploadButton(label='Upload a FASTA file',
+                                                                            type='binary',
+                                                                            visible=True, variant='primary',
+                                                                            size='lg', elem_classes="upload_button")
+                                        target_query_btn = gr.Button(value='Query the sequence', variant='primary',
+                                                                     elem_classes='upload_button', visible=False)
+                                with gr.Column(scale=1):
+                                    with gr.Row():
+                                        with gr.Group():
+                                            drug_screen_target_family = gr.Dropdown(
+                                                choices=list(TARGET_FAMILY_MAP.keys()),
+                                                value='General',
+                                                label='Target family', interactive=True)
+                                            # with gr.Column(scale=1, min_width=24):
+                                            auto_detect_btn = gr.Button(value='Auto-detect', variant='primary')
+                                        HelpTip(
+                                            "Target amino acid sequence in the FASTA format. Alternatively, you may use a "
+                                            "UniProt ID/accession to query UniProt database for the sequence of your target"
+                                            "of interest. You can also search on databases like UniProt, RCSB PDB, "
+                                            "NCBI Protein for the FASTA string representing your target of interest. If "
+                                            "the input FASTA contains multiple entities, only the first one will be used."
+                                        )
+                        with gr.Column(variant='panel'):
+                            with gr.Group():
+                                drug_library = gr.Radio(label='Drug library',
+                                                        choices=list(DRUG_LIBRARY_MAP.keys()) + ['Upload a drug library'])
+                                drug_library_upload = gr.File(label='Custom drug library file', visible=True)
+                    with gr.Row(variant='panel'):
+                        drug_screen_task = gr.Radio(list(TASK_MAP.keys()), label='Task',
+                                                    value='Drug-target interaction')
+                        with gr.Column(scale=2):
+                            with gr.Group():
+                                drug_screen_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Model')
+                                recommend_btn = gr.Button(value='Recommend a model', variant='primary')
+                            HelpTip("We recommend the appropriate model for your use case based on model performance "
+                                    "in drug-target interaction or binding affinity prediction "
+                                    "benchmarked on different target families and real-world data scenarios.")
+                    # drug_screen_email = gr.Textbox(
+                    #     label='Email (optional)',
+                    #     info="Your email will be used to send you notifications when your job finishes."
+                    # )
+                    with gr.Row(visible=True):
+                        drug_screen_clr_btn = gr.ClearButton()
+                        drug_screen_btn = gr.Button(value='SCREEN', variant='primary')
+                    # TODO Modify the pd df directly with df['X2'] = target
+            screen_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
+            screen_waiting = gr.Markdown("""
+            <center>Your job is running... It might take a few minutes.
+            When it's done, you will be redirected to the report page.
+            Meanwhile, please leave the page on.</center>
+            """, visible=False)
+        with gr.TabItem(label='Target protein identification', id=1):
             gr.Markdown('''
+                # <center>DeepSEQreen Target Protein Identification</center>
+                <center>
+                To predict interactions/binding affinities of a single drug against a library of targets.
+                </center>
+                ''')
+            with gr.Blocks() as identify_block:
+                with gr.Column() as identify_page:
+                    with gr.Row():
+                        with gr.Group():
+                            drug_type = gr.Dropdown(label='Drug input type',
+                                                    choices=['SMILES', 'SDF'],
+                                                    value='SMILES',
+                                                    scale=1,
+                                                    interactive=True)
+                            drug_upload = gr.UploadButton(label='⤒ Upload a file')
+                        drug_smiles = gr.Code(label='Drug canonical SMILES', interactive=True, scale=5, lines=5)
+                        with gr.Column(scale=1):
+                            HelpTip(
+                                """Drug molecule in the SMILES format. You may search on databases like
+                                NCBI PubChem, ChEMBL, and DrugBank for the SMILES strings
+                                representing your drugs of interest.
+                                """
+                            )
+                            example_drug = gr.Button(value='Example: Aspirin', elem_id='example')
+                    with gr.Column(variant='panel'):
+                        with gr.Group():
+                            target_library = gr.Radio(label='Target library',
+                                                      choices=list(TARGET_LIBRARY_MAP.keys()) + ['Upload a target library'])
+                            target_library_upload = gr.File(label='Custom target library file', visible=True)
+                    with gr.Row(visible=True):
+                        target_identify_task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
+                        HelpTip("Choose a preset model for making the predictions.")
+                        target_identify_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
+                        HelpTip("Choose the protein family of your target.")
+                        target_identify_target_family = gr.Dropdown(choices=['General'],
+                                                                    value='General',
+                                                                    label='Target family')
+                    # with gr.Row():
+                    #     target_identify_email = gr.Textbox(
+                    #         label='Email (optional)',
+                    #         info="Your email will be used to send you notifications when your job finishes."
+                    #     )
+                    with gr.Row(visible=True):
+                        target_identify_clr_btn = gr.ClearButton()
+                        target_identify_btn = gr.Button(value='IDENTIFY', variant='primary')
+            identify_data_for_predict = gr.File(visible=False, file_count="single", type='filepath')
+            identify_waiting = gr.Markdown(f"Your job is running... It might take a few minutes."
+                                           f"When it's done, you will be redirected to the report page. "
+                                           f"Meanwhile, please leave the page on.",
+                                           visible=False)
+        with gr.TabItem(label='Interaction pair inference', id=2):
+            gr.Markdown('''
+                # <center>DeepSEQreen Interaction Pair Inference</center>
+                <center>
+                To predict interactions/binding affinities between any drug-target pairs.
+                </center>
+                ''')
+            with gr.Blocks() as infer_block:
+                with gr.Column() as infer_page:
+                    HelpTip("Upload a custom drug-target pair dataset. See the documentation for details.")
+                    infer_data_for_predict = gr.File(
+                        label='Prediction dataset file', file_count="single", type='filepath')
+                    # TODO example dataset
+                    # TODO download example dataset
+                    with gr.Row(visible=True):
+                        pair_infer_task = gr.Dropdown(list(TASK_MAP.keys()), label='Task')
+                        HelpTip("Choose a preset model for making the predictions.")
+                        pair_infer_preset = gr.Dropdown(list(PRESET_MAP.keys()), label='Preset')
+                        HelpTip("Choose the protein family of your target.")
+                        pair_infer_target_family = gr.Dropdown(choices=['General'],
+                                                               label='Target family',
+                                                               value='General')
+                    # with gr.Row():
+                    #     pair_infer_email = gr.Textbox(
+                    #         label='Email (optional)',
+                    #         info="Your email will be used to send you notifications when your job finishes."
+                    #     )
+                    with gr.Row(visible=True):
+                        pair_infer_clr_btn = gr.ClearButton()
+                        pair_infer_btn = gr.Button(value='INFER', variant='primary')
+            infer_waiting = gr.Markdown(f"Your job is running... It might take a few minutes."
+                                        f"When it's done, you will be redirected to the report page. "
+                                        f"Meanwhile, please leave the page on.",
+                                        visible=False)
+        with gr.TabItem(label='Chemical property report', id=3):
+            with gr.Blocks() as report:
+                gr.Markdown('''
+                # <center>DeepSEQreen Chemical Property Report</center>
+                <center>
+                To compute chemical properties for the predictions of drug hit screening,
+                target protein identification, and interaction pair inference. You may also upload
+                your own dataset.
+                </center>
+                ''')
+                with gr.Row():
+                    file_for_report = gr.File(interactive=True, type='filepath')
+                    # df_original = gr.Dataframe(type="pandas", interactive=False, visible=False)
+                    scores = gr.CheckboxGroup(list(SCORE_MAP.keys()), label='Scores')
+                    filters = gr.CheckboxGroup(list(FILTER_MAP.keys()), label='Filters')
+                with gr.Row():
+                    clear_btn = gr.ClearButton()
+                    analyze_btn = gr.Button('REPORT', variant='primary')
+                with gr.Row():
+                    with gr.Column(scale=3):
+                        html_report = gr.HTML()  # label='Results', visible=True)
+                    ranking_pie_chart = gr.Plot(visible=False)
+                with gr.Row():
+                    csv_download_btn = gr.Button('Download report (HTML)', variant='primary')
+                    html_download_btn = gr.Button('Download raw data (CSV)', variant='primary')
+    def target_input_type_select(input_type):
+        match input_type:
+            case 'UniProt ID':
+                return [gr.UploadButton(visible=False),
+                        gr.Textbox(visible=True, label='UniProt ID/accession', info=None, value=''),
+                        gr.Button(visible=True)]
+            case 'Gene symbol':
+                return [gr.UploadButton(visible=False),
+                        gr.Textbox(visible=True, label='Gene symbol/name', info='Organism: human', value=''),
+                        gr.Button(visible=True)]
+            case 'Sequence':
+                return [gr.UploadButton(visible=True),
+                        gr.Textbox(visible=False), gr.Button(visible=False)]
+    target_input_type.select(fn=target_input_type_select,
+                             inputs=target_input_type, outputs=[target_upload_btn, target_query, target_query_btn],
+                             show_progress=False)
+    def uniprot_query(query, input_type):
+        fasta_seq = ''
+        query = query.strip()
+        match input_type:
+            case 'UniProt ID':
+                query = f"{query.strip()}.fasta"
+            case 'Gene symbol':
+                query = f'search?query=organism_id:9606+AND+gene:{query}&format=fasta'
+        try:
+            fasta = SESSION.get(UNIPROT_ENDPOINT.format(query=query))
+            fasta.raise_for_status()
+            fasta_seq = fasta.text
+        except Exception as e:
+            raise gr.Warning(f"Failed to query FASTA from UniProt due to {str(e)}")
+        finally:
+            return fasta_seq
+    target_upload_btn.upload(fn=lambda x: x.decode(), inputs=target_upload_btn, outputs=target_fasta)
+    target_query_btn.click(uniprot_query, inputs=[target_query, target_input_type], outputs=target_fasta)
+    target_fasta.focus(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress=False)
+    target_fasta.blur(fn=wrap_text, inputs=target_fasta, outputs=target_fasta, show_progress=False)
+    drug_smiles.focus(fn=wrap_text, inputs=drug_smiles, outputs=drug_smiles, show_progress=False)
+    drug_smiles.blur(fn=wrap_text, inputs=drug_smiles, outputs=drug_smiles, show_progress=False)
+    def example_fill(input_type):
+        match input_type:
+            case 'UniProt ID':
+                query = 'Q16539'
+            case 'Gene symbol':
+                query = 'MAPK14'
+            case _:
+                query = ''
+        return {target_query: query,
+                target_fasta: """
+>sp|Q16539|MK14_HUMAN Mitogen-activated protein kinase 14 OS=Homo sapiens OX=9606 GN=MAPK14 PE=1 SV=3
+MSQERPTFYRQELNKTIWEVPERYQNLSPVGSGAYGSVCAAFDTKTGLRVAVKKLSRPFQ
+SIIHAKRTYRELRLLKHMKHENVIGLLDVFTPARSLEEFNDVYLVTHLMGADLNNIVKCQ
+KLTDDHVQFLIYQILRGLKYIHSADIIHRDLKPSNLAVNEDCELKILDFGLARHTDDEMT
+GYVATRWYRAPEIMLNWMHYNQTVDIWSVGCIMAELLTGRTLFPGTDHIDQLKLILRLVG
+TPGAELLKKISSESARNYIQSLTQMPKMNFANVFIGANPLAVDLLEKMLVLDSDKRITAA
+QALAHAYFAQYHDPDDEPVADPYDQSFESRDLLIDEWKSLTYDEVISFVPPPLDQEEMES
+"""}
+    example_target.click(fn=example_fill, inputs=target_input_type,
+                         outputs=[target_query, target_fasta], show_progress=False)
+    example_drug.click(fn=lambda: 'CC(=O)Oc1ccccc1C(=O)O', outputs=drug_smiles, show_progress=False)
+    def drug_screen_validate(fasta, library, library_upload, state):
+        if not state:
+            def process_target_fasta(sequence):
+                lines = sequence.strip().split("\n")
+                if lines[0].startswith(">"):
+                    lines = lines[1:]
+                return ''.join(lines).split(">")[0]
+            fasta = process_target_fasta(fasta)
+            err = validate_seq_str(fasta, FASTA_PAT)
+            if err:
+                raise gr.Error(f'Found error(s) in your target fasta input: {err}')
+            if library in DRUG_LIBRARY_MAP.keys():
+                screen_df = pd.read_csv(Path('data/drug_libraries', DRUG_LIBRARY_MAP[library]))
+            else:
+                screen_df = pd.read_csv(library_upload)
+                validate_columns(screen_df, ['X1'])
+            screen_df['X2'] = fasta
+            job_id = uuid4()
+            temp_file = Path(f'{job_id}_temp.csv').resolve()
+            screen_df.to_csv(temp_file)
+            if temp_file.is_file():
+                return {screen_data_for_predict: str(temp_file),
+                        screen_flag: job_id,
+                        run_state: job_id}
+        else:
+            gr.Warning('You have another prediction job '
+                       '(drug hit screening, target protein identification, or interation pair inference) '
+                       'running in the session right now. '
+                       'Please submit another job when your current job has finished.')
+            return {screen_flag: False}
+    def target_identify_validate(smiles, library, library_upload, state):
+        if not state:
+            err = validate_seq_str(smiles, SMILES_PAT)
+            if err:
+                raise gr.Error(f'Found error(s) in your compound SMILES input: {err}')
+            if library in TARGET_LIBRARY_MAP.keys():
+                identify_df = pd.read_csv(TARGET_LIBRARY_MAP['target_library'])
+            else:
+                identify_df = pd.read_csv(library_upload)
+                validate_columns(identify_df, ['X2'])
+            identify_df['X1'] = smiles
+            job_id = uuid4()
+            temp_file = Path(f'{job_id}_temp.csv').resolve()
+            identify_df.to_csv(temp_file)
+            if temp_file.is_file():
+                return {identify_data_for_predict: str(temp_file),
+                        identify_flag: gr.State(job_id),
+                        run_state: gr.State(job_id)}
+        else:
+            gr.Warning('You have another prediction job '
+                       '(drug hit screening, target protein identification, or interation pair inference) '
+                       'running in the session right now. '
+                       'Please submit another job when your current job has finished.')
+            return {identify_flag: False}
+    def pair_infer_validate(drug_target_pair_upload, run_state):
+        if not run_state:
+            df = pd.read_csv(drug_target_pair_upload)
+            validate_columns(df, ['X1', 'X2'])
+            df['X1_ERR'] = df['X1'].swifter.apply(
+                validate_seq_str, regex=SMILES_PAT)
+            df['X2_ERR'] = df['X2'].swifter.apply(
+                validate_seq_str, regex=FASTA_PAT)
+            if not df['X1_ERR'].isna().all():
+                raise gr.Error(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+            if not df['X2_ERR'].isna().all():
+                raise gr.Error(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
+            job_id = uuid4()
+            return {infer_flag: gr.State(job_id),
+                    run_state: gr.State(job_id)}
+        else:
+            gr.Warning('You have another prediction job '
+                       '(drug hit screening, target protein identification, or interation pair inference) '
+                       'running in the session right now. '
+                       'Please submit another job when your current job has finished.')
+            return {infer_flag: False}
+    drug_screen_btn.click(
+        fn=drug_screen_validate,
+        inputs=[target_fasta, drug_library, drug_library_upload, run_state],  # , drug_screen_email],
+        outputs=[screen_data_for_predict, screen_flag, run_state]
+    ).then(
+        fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
+        outputs=[screen_page, screen_waiting]
+    ).then(
+        fn=submit_predict,
+        inputs=[screen_data_for_predict, drug_screen_task, drug_screen_preset,
+                drug_screen_target_family, screen_flag],  # , drug_screen_email],
+        outputs=[file_for_report, run_state]
+    ).then(
+        fn=lambda: [gr.Column(visible=True), gr.Markdown(visible=False)],
+        outputs=[screen_page, screen_waiting]
+    )
+    target_identify_btn.click(
+        fn=target_identify_validate,
+        inputs=[drug_smiles, target_library, target_library_upload, run_state], # , drug_screen_email],
+        outputs=[identify_data_for_predict, identify_flag, run_state]
+    ).then(
+        fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
+        outputs=[identify_page, identify_waiting]
+    ).then(
+        fn=submit_predict,
+        inputs=[identify_data_for_predict, target_identify_task, target_identify_preset,
+                target_identify_target_family, identify_flag],  # , target_identify_email],
+        outputs=[file_for_report, run_state]
+    ).then(
+        fn=lambda: [gr.Column(visible=True), gr.Markdown(visible=False)],
+        outputs=[identify_page, identify_waiting]
+    )
+    pair_infer_btn.click(
+        fn=pair_infer_validate,
+        inputs=[infer_data_for_predict, run_state],  # , drug_screen_email],
+        outputs=[infer_flag, run_state]
+    ).then(
+        fn=lambda: [gr.Column(visible=False), gr.Markdown(visible=True)],
+        outputs=[infer_page, infer_waiting]
+    ).then(
+        fn=submit_predict,
+        inputs=[infer_data_for_predict, pair_infer_task, pair_infer_preset,
+                pair_infer_target_family, infer_flag],  # , pair_infer_email],
+        outputs=[file_for_report, run_state]
+    ).then(
+        fn=lambda: [gr.Column(visible=True), gr.Markdown(visible=False)],
+        outputs=[infer_page, infer_waiting]
+    )
+    # TODO background job from these 3 pipelines to update file_for_report
+    file_for_report.change(fn=update_df, inputs=file_for_report, outputs=[html_report, ranking_pie_chart])
+    analyze_btn.click(fn=submit_report, inputs=[scores, filters], outputs=[html_report, ranking_pie_chart])
+    # screen_waiting.change(fn=check_job_status, inputs=run_state, outputs=[pair_waiting, tabs, file_for_report],
+    #                       every=5)
+    # identify_waiting.change(fn=check_job_status, inputs=run_state, outputs=[identify_waiting, tabs, file_for_report],
+    #                         every=5)
+    # pair_waiting.change(fn=check_job_status, inputs=run_state, outputs=[pair_waiting, tabs, file_for_report],
+    #                     every=5)
+    # demo.load(None, None, None, js="() => {document.body.classList.remove('dark')}")
+if __name__ == "__main__":
+    screen_block.queue(max_size=2)
+    identify_block.queue(max_size=2)
+    infer_block.queue(max_size=2)
+    report.queue(max_size=20)
+    # SCHEDULER.add_job(func=file_cleanup(), trigger="interval", seconds=60)
+    # SCHEDULER.start()
+    demo.launch(
+        # debug=True,
+        show_api=False,
+        # favicon_path=,
+        # inline=False
+        debug=True
+    )

data/target_libraries/ChEMBL33_all_spe_single_prot_info.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

deepscreen/__init__.py CHANGED Viewed

@@ -20,9 +20,9 @@ OmegaConf.register_new_resolver("eval", eval)
 def sanitize_path(path_str: str):
     """
-    Sanitize a string for path creation by replacing unsafe characters.
     """
-    return path_str.replace("/", ".").replace("\\", ".").replace(":", "-")
 OmegaConf.register_new_resolver("sanitize_path", sanitize_path)

 def sanitize_path(path_str: str):
     """
+    Sanitize a string for path creation by replacing unsafe characters and cutting length to 255 (OS limitation).
     """
+    return path_str.replace("/", ".").replace("\\", ".").replace(":", "-")[:255]
 OmegaConf.register_new_resolver("sanitize_path", sanitize_path)

deepscreen/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/__pycache__/__init__.cpython-311.pyc and b/deepscreen/__pycache__/__init__.cpython-311.pyc differ

deepscreen/__pycache__/train.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/__pycache__/train.cpython-311.pyc and b/deepscreen/__pycache__/train.cpython-311.pyc differ

deepscreen/data/__pycache__/dti.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/__pycache__/dti.cpython-311.pyc and b/deepscreen/data/__pycache__/dti.cpython-311.pyc differ

deepscreen/data/dti.py CHANGED Viewed

@@ -1,3 +1,4 @@
 from functools import partial
 from numbers import Number
 from pathlib import Path
@@ -5,6 +6,7 @@ from typing import Any, Dict, Optional, Sequence, Union, Literal
 from lightning import LightningDataModule
 import pandas as pd
 from sklearn.preprocessing import LabelEncoder
 from torch.utils.data import Dataset, DataLoader
@@ -13,9 +15,33 @@ from deepscreen.utils import get_logger
 log = get_logger(__name__)
 # TODO: save a list of corrupted records
 class DTIDataset(Dataset):
     def __init__(
@@ -27,6 +53,7 @@ class DTIDataset(Dataset):
             protein_featurizer: callable,
             thresholds: Optional[Union[Number, Sequence[Number]]] = None,
             discard_intermediate: Optional[bool] = False,
     ):
         df = pd.read_csv(
             data_path,
@@ -58,40 +85,43 @@ class DTIDataset(Dataset):
         # Forward-fill all non-label columns
         df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
         if 'Y' in df:
-            log.info(f"Performing pre-transformation target validation.")
             # TODO: check sklearn.utils.multiclass.check_classification_targets
             match task:
                 case 'regression':
-                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
-                        but it has {set(df['Y'].apply(type))}."""
                 case 'binary':
                     if all(df['Y'].isin([0, 1])):
                         assert not thresholds, \
                             f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
-                            but still got `thresholds` {thresholds}.
-                            Double check your choices of `task` and `thresholds` and records in the `Y` column."""
                     else:
                         assert thresholds, \
                             f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
-                            but it has {pd.unique(df['Y'])}.
-                            You must set `thresholds` to discretize continuous labels."""
                 case 'multiclass':
                     assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
-                    if all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)):
                         assert not thresholds, \
                             f"""`Y` is already non-negative integers for
-                            `multiclass` (classification) `task`, but still got `thresholds` {thresholds}.
                             Double check your choice of `task`, `thresholds` and records in the `Y` column."""
                     else:
                         assert thresholds, \
                             f"""`Y` must be non-negative integers for
                             `multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
-                            You must set `thresholds` to discretize continuous labels."""
             if 'U' in df.columns:
                 units = df['U']
@@ -107,37 +137,51 @@ class DTIDataset(Dataset):
             # Filter out rows with a NaN in Y (missing values)
             df.dropna(subset=['Y'], inplace=True)
-            log.info(f"Performing post-transformation target validation.")
             match task:
                 case 'regression':
                     df['Y'] = df['Y'].astype('float32')
-                    assert all(df['Y'].apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
-                        but after transformation it still has {set(df['Y'].apply(type))}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
                 case 'binary':
                     df['Y'] = df['Y'].astype('int')
                     assert all(df['Y'].isin([0, 1])), \
                         f"""`Y` must be 0 or 1 for `task=binary`, "
                         but after transformation it still has {pd.unique(df['Y'])}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
                 case 'multiclass':
                     df['Y'] = df['Y'].astype('int')
-                    assert all(df['Y'].apply(lambda x: x.is_integer() and x >= 0)), \
                         f"""Y must be non-negative integers for `task=multiclass`
                         but after transformation it still has {pd.unique(df['Y'])}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
                     target_n_unique = df['Y'].nunique()
                     assert target_n_unique == num_classes, \
                         f"""You have set `num_classes` for `task=multiclass` to {num_classes},
                         but after transformation Y still has {target_n_unique} unique labels.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
-        # Indexed protein/FASTA for retrieval metrics
-        df['IDX'] = LabelEncoder().fit_transform(df['X2'])
         self.df = df
         self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
@@ -151,13 +195,13 @@ class DTIDataset(Dataset):
         return {
             'N': i,
             'X1': sample['X1'],
-            'X1^': self.drug_featurizer(sample['X1']),
-            'ID1': sample.get('ID1', sample['X1']),
             'X2': sample['X2'],
             'X2^': self.protein_featurizer(sample['X2']),
-            'ID2': sample.get('ID2', sample['X2']),
             'Y': sample.get('Y'),
-            'IDX': sample['IDX'],
         }

+import re
 from functools import partial
 from numbers import Number
 from pathlib import Path
 from lightning import LightningDataModule
 import pandas as pd
+import swifter
 from sklearn.preprocessing import LabelEncoder
 from torch.utils.data import Dataset, DataLoader
 log = get_logger(__name__)
+SMILES_PAT = r"[^A-Za-z0-9=#:+\-\[\]<>()/\\@%,.*]"
+FASTA_PAT = r"[^A-Z*\-]"
def validate_seq_str(seq, regex):
    """Report which characters of ``seq`` match ``regex``, a pattern of disallowed characters.

    Args:
        seq: Sequence string to validate (e.g. a SMILES or FASTA string).
        regex: Regular expression matching every character that is NOT allowed,
            such as ``SMILES_PAT`` or ``FASTA_PAT``.

    Returns:
        ``None`` if ``seq`` is non-empty and contains no disallowed character,
        ``'Empty string'`` if ``seq`` is falsy (empty or ``None``), otherwise the
        distinct offending characters joined by ``', '`` in sorted order.
    """
    if not seq:
        return 'Empty string'

    err_charset = set(re.findall(regex, seq))
    if not err_charset:
        return None

    # Sort before joining: set iteration order is arbitrary, and the result is
    # shown to users in error reports, so it should be reproducible.
    return ', '.join(sorted(err_charset))
 # TODO: save a list of corrupted records
def rdkit_canonicalize(smiles):
    """Return the RDKit canonical SMILES for ``smiles``, falling back to the input.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        The canonical SMILES produced by RDKit, or the original ``smiles``
        unchanged (after logging a warning) if RDKit cannot process it.
    """
    # RDKit is imported at call time rather than at module import time.
    from rdkit import Chem
    try:
        mol = Chem.MolFromSmiles(smiles)
        # MolFromSmiles reports an unparsable string by returning None instead of
        # raising, so fail explicitly here to get a meaningful warning message.
        if mol is None:
            raise ValueError('RDKit could not parse the SMILES string')
        return Chem.MolToSmiles(mol)
    except Exception as e:
        # Deliberate best effort: canonicalization failure must not abort data loading.
        log.warning(f'Failed to canonicalize SMILES using RDKIT due to {str(e)}. Returning original SMILES: {smiles}')
        return smiles
 class DTIDataset(Dataset):
     def __init__(
             protein_featurizer: callable,
             thresholds: Optional[Union[Number, Sequence[Number]]] = None,
             discard_intermediate: Optional[bool] = False,
+            query: Optional[str] = 'X2'
     ):
         df = pd.read_csv(
             data_path,
         # Forward-fill all non-label columns
         df.loc[:, df.columns != 'Y'] = df.loc[:, df.columns != 'Y'].ffill(axis=0)
+        # TODO potentially allow running through the whole data validation process
+        # error = False
         if 'Y' in df:
+            log.info(f"Validating labels (`Y`)...")
             # TODO: check sklearn.utils.multiclass.check_classification_targets
             match task:
                 case 'regression':
+                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
+                        but it has {set(df['Y'].swifter.apply(type))}."""
                 case 'binary':
                     if all(df['Y'].isin([0, 1])):
                         assert not thresholds, \
                             f"""`Y` is already 0 or 1 for `binary` (classification) `task`,
+                            but still got `thresholds` ({thresholds}).
+                            Double check your choices of `task` and `thresholds`, and records in the `Y` column."""
                     else:
                         assert thresholds, \
                             f"""`Y` must be 0 or 1 for `binary` (classification) `task`,
+                            but it has {pd.unique(df['Y'])}.
+                            You may set `thresholds` to discretize continuous labels."""  # TODO print err idx instead
                 case 'multiclass':
                     assert num_classes >= 3, f'`num_classes` for `task=multiclass` must be at least 3.'
+                    if all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)):
                         assert not thresholds, \
                             f"""`Y` is already non-negative integers for
+                            `multiclass` (classification) `task`, but still got `thresholds` ({thresholds}).
                             Double check your choice of `task`, `thresholds` and records in the `Y` column."""
                     else:
                         assert thresholds, \
                             f"""`Y` must be non-negative integers for
                             `multiclass` (classification) 'task',but it has {pd.unique(df['Y'])}.
+                            You must set `thresholds` to discretize continuous labels."""  # TODO print err idx instead
             if 'U' in df.columns:
                 units = df['U']
             # Filter out rows with a NaN in Y (missing values)
             df.dropna(subset=['Y'], inplace=True)
             match task:
                 case 'regression':
                     df['Y'] = df['Y'].astype('float32')
+                    assert all(df['Y'].swifter.apply(lambda x: isinstance(x, Number))), \
                         f"""`Y` must be numeric for `regression` task,
+                        but after transformation it still has {set(df['Y'].swifter.apply(type))}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                    # TODO print err idx instead
                 case 'binary':
                     df['Y'] = df['Y'].astype('int')
                     assert all(df['Y'].isin([0, 1])), \
                         f"""`Y` must be 0 or 1 for `task=binary`, "
                         but after transformation it still has {pd.unique(df['Y'])}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                    # TODO print err idx instead
                 case 'multiclass':
                     df['Y'] = df['Y'].astype('int')
+                    assert all(df['Y'].swifter.apply(lambda x: x.is_integer() and x >= 0)), \
                         f"""Y must be non-negative integers for `task=multiclass`
                         but after transformation it still has {pd.unique(df['Y'])}.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+                    # TODO print err idx instead
                     target_n_unique = df['Y'].nunique()
                     assert target_n_unique == num_classes, \
                         f"""You have set `num_classes` for `task=multiclass` to {num_classes},
                         but after transformation Y still has {target_n_unique} unique labels.
                         Double check your choices of `task` and `thresholds` and records in the `Y` and `U` columns."""
+        log.info("Validating SMILES (`X1`)...")
+        df['X1_ERR'] = df['X1'].swifter.progress_bar(
+            desc="Validating SMILES...").apply(validate_seq_str, regex=SMILES_PAT)
+        if not df['X1_ERR'].isna().all():
+            raise Exception(f"Encountered invalid SMILES:\n{df[~df['X1_ERR'].isna()][['X1', 'X1_ERR']]}")
+        df['X1^'] = df['X1'].apply(rdkit_canonicalize)  # swifter
+        log.info("Validating FASTA (`X2`)...")
+        df['X2'] = df['X2'].str.upper()
+        df['X2_ERR'] = df['X2'].swifter.progress_bar(
+            desc="Validating FASTA...").apply(validate_seq_str, regex=FASTA_PAT)
+        if not df['X2_ERR'].isna().all():
+            raise Exception(f"Encountered invalid FASTA:\n{df[~df['X2_ERR'].isna()][['X2', 'X2_ERR']]}")
+        # FASTA/SMILES indices as query for retrieval metrics like enrichment factor and hit rate
+        if query:
+            df['ID^'] = LabelEncoder().fit_transform(df[query])
         self.df = df
         self.drug_featurizer = drug_featurizer if drug_featurizer is not None else (lambda x: x)
         return {
             'N': i,
             'X1': sample['X1'],
+            'X1^': self.drug_featurizer(sample['X1^']),
+            'ID1': sample.get('ID1'),
             'X2': sample['X2'],
             'X2^': self.protein_featurizer(sample['X2']),
+            'ID2': sample.get('ID2'),
             'Y': sample.get('Y'),
+            'ID^': sample.get('ID^'),
         }

deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc and b/deepscreen/data/featurizers/__pycache__/__init__.cpython-311.pyc differ

deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc and b/deepscreen/data/featurizers/__pycache__/categorical.cpython-311.pyc differ

deepscreen/data/featurizers/__pycache__/graph.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/featurizers/__pycache__/graph.cpython-311.pyc and b/deepscreen/data/featurizers/__pycache__/graph.cpython-311.pyc differ

deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc and b/deepscreen/data/featurizers/__pycache__/token.cpython-311.pyc differ

deepscreen/data/featurizers/categorical.py CHANGED Viewed

@@ -2,20 +2,20 @@ import numpy as np
 # Sets of KNOWN characters in SMILES and FASTA sequences
 # Use list instead of set to preserve character order
-SMILES_CHARSET = ('#', '%', ')', '(', '+', '-', '.', '1', '0', '3', '2', '5', '4',
-                  '7', '6', '9', '8', '=', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I',
-                  'H', 'K', 'M', 'L', 'O', 'N', 'P', 'S', 'R', 'U', 'T', 'W', 'V',
-                  'Y', '[', 'Z', ']', '_', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i',
-                  'h', 'm', 'l', 'o', 'n', 's', 'r', 'u', 't', 'y')
-FASTA_CHARSET = ('A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'O',
-                 'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z')
 # Check uniqueness, create character-index dicts, and add '?' for unknown characters as index 0
-assert len(SMILES_CHARSET) == len(set(SMILES_CHARSET)), 'SMILES_CHARSET has duplicate characters.'
-SMILES_CHARSET_IDX = {character: index+1 for index, character in enumerate(SMILES_CHARSET)} | {'?': 0}
-assert len(FASTA_CHARSET) == len(set(FASTA_CHARSET)), 'FASTA_CHARSET has duplicate characters.'
-FASTA_CHARSET_IDX = {character: index+1 for index, character in enumerate(FASTA_CHARSET)} | {'?': 0}
 def sequence_to_onehot(sequence: str, charset, max_sequence_length: int):
@@ -40,7 +40,7 @@ def sequence_to_label(sequence: str, charset, max_sequence_length: int):
     return label
-def smiles_to_onehot(smiles: str, smiles_charset=SMILES_CHARSET, max_sequence_length: int = 100):  # , in_channels: int = len(SMILES_CHARSET)
     # assert len(SMILES_CHARSET) == len(set(SMILES_CHARSET)), 'SMILES_CHARSET has duplicate characters.'
     # onehot = np.zeros((max_sequence_length, len(SMILES_CHARSET_IDX)))
     # for index, character in enumerate(smiles[:max_sequence_length]):
@@ -49,7 +49,7 @@ def smiles_to_onehot(smiles: str, smiles_charset=SMILES_CHARSET, max_sequence_le
     return sequence_to_onehot(smiles, smiles_charset, max_sequence_length)
-def smiles_to_label(smiles: str, smiles_charset=SMILES_CHARSET, max_sequence_length: int = 100):  # , in_channels: int = len(SMILES_CHARSET)
     # label = np.zeros(max_sequence_length)
     # for index, character in enumerate(smiles[:max_sequence_length]):
     #     label[index] = SMILES_CHARSET_IDX.get(character, 0)
@@ -57,7 +57,7 @@ def smiles_to_label(smiles: str, smiles_charset=SMILES_CHARSET, max_sequence_len
     return sequence_to_label(smiles, smiles_charset, max_sequence_length)
-def fasta_to_onehot(fasta: str, fasta_charset=FASTA_CHARSET, max_sequence_length: int = 1000):  # in_channels: int = len(FASTA_CHARSET)
     # onehot = np.zeros((max_sequence_length, len(FASTA_CHARSET_IDX)))
     # for index, character in enumerate(fasta[:max_sequence_length]):
     #     onehot[index, FASTA_CHARSET_IDX.get(character, 0)] = 1
@@ -65,7 +65,7 @@ def fasta_to_onehot(fasta: str, fasta_charset=FASTA_CHARSET, max_sequence_length
     return sequence_to_onehot(fasta, fasta_charset, max_sequence_length)
-def fasta_to_label(fasta: str, fasta_charset=FASTA_CHARSET, max_sequence_length: int = 1000):  # in_channels: int = len(FASTA_CHARSET)
     # label = np.zeros(max_sequence_length)
     # for index, character in enumerate(fasta[:max_sequence_length]):
     #     label[index] = FASTA_CHARSET_IDX.get(character, 0)

 # Sets of KNOWN characters in SMILES and FASTA sequences
 # Use list instead of set to preserve character order
+SMILES_VOCAB = ('#', '%', ')', '(', '+', '-', '.', '1', '0', '3', '2', '5', '4',
+                '7', '6', '9', '8', '=', 'A', 'C', 'B', 'E', 'D', 'G', 'F', 'I',
+                'H', 'K', 'M', 'L', 'O', 'N', 'P', 'S', 'R', 'U', 'T', 'W', 'V',
+                'Y', '[', 'Z', ']', '_', 'a', 'c', 'b', 'e', 'd', 'g', 'f', 'i',
+                'h', 'm', 'l', 'o', 'n', 's', 'r', 'u', 't', 'y')
+FASTA_VOCAB = ('A', 'C', 'B', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'O',
+               'N', 'Q', 'P', 'S', 'R', 'U', 'T', 'W', 'V', 'Y', 'X', 'Z')
 # Check uniqueness, create character-index dicts, and add '?' for unknown characters as index 0
+assert len(SMILES_VOCAB) == len(set(SMILES_VOCAB)), 'SMILES_CHARSET has duplicate characters.'
+SMILES_CHARSET_IDX = {character: index+1 for index, character in enumerate(SMILES_VOCAB)} | {'?': 0}
+assert len(FASTA_VOCAB) == len(set(FASTA_VOCAB)), 'FASTA_CHARSET has duplicate characters.'
+FASTA_CHARSET_IDX = {character: index+1 for index, character in enumerate(FASTA_VOCAB)} | {'?': 0}
 def sequence_to_onehot(sequence: str, charset, max_sequence_length: int):
     return label
+def smiles_to_onehot(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):  # , in_channels: int = len(SMILES_CHARSET)
     # assert len(SMILES_CHARSET) == len(set(SMILES_CHARSET)), 'SMILES_CHARSET has duplicate characters.'
     # onehot = np.zeros((max_sequence_length, len(SMILES_CHARSET_IDX)))
     # for index, character in enumerate(smiles[:max_sequence_length]):
     return sequence_to_onehot(smiles, smiles_charset, max_sequence_length)
+def smiles_to_label(smiles: str, smiles_charset=SMILES_VOCAB, max_sequence_length: int = 100):  # , in_channels: int = len(SMILES_CHARSET)
     # label = np.zeros(max_sequence_length)
     # for index, character in enumerate(smiles[:max_sequence_length]):
     #     label[index] = SMILES_CHARSET_IDX.get(character, 0)
     return sequence_to_label(smiles, smiles_charset, max_sequence_length)
+def fasta_to_onehot(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):  # in_channels: int = len(FASTA_CHARSET)
     # onehot = np.zeros((max_sequence_length, len(FASTA_CHARSET_IDX)))
     # for index, character in enumerate(fasta[:max_sequence_length]):
     #     onehot[index, FASTA_CHARSET_IDX.get(character, 0)] = 1
     return sequence_to_onehot(fasta, fasta_charset, max_sequence_length)
+def fasta_to_label(fasta: str, fasta_charset=FASTA_VOCAB, max_sequence_length: int = 1000):  # in_channels: int = len(FASTA_CHARSET)
     # label = np.zeros(max_sequence_length)
     # for index, character in enumerate(fasta[:max_sequence_length]):
     #     label[index] = FASTA_CHARSET_IDX.get(character, 0)

deepscreen/data/featurizers/monn.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import numpy as np
 from rdkit.Chem import MolFromSmiles
-from deepscreen.data.featurizers.categorical import FASTA_CHARSET, fasta_to_label
 from deepscreen.data.featurizers.graph import atom_features, bond_features

 import numpy as np
 from rdkit.Chem import MolFromSmiles
+from deepscreen.data.featurizers.categorical import FASTA_VOCAB, fasta_to_label
 from deepscreen.data.featurizers.graph import atom_features, bond_features

deepscreen/data/featurizers/token.py CHANGED Viewed

@@ -7,13 +7,12 @@ from typing import Optional, List
 import numpy as np
 from transformers import BertTokenizer
-SMI_REGEX_PATTERN = r"""(
-    \[[^\]]+\] # match anything inside square brackets
-    |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p # match elements
-    |\(|\) # match parentheses
-    |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2} # match various symbols
-    |[0-9] # match digits
-)"""
 def sequence_to_kmers(sequence, k=3):
@@ -30,17 +29,21 @@ def sequence_to_kmers(sequence, k=3):
 def sequence_to_word_embedding(sequence, model):
     """Get protein embedding, infer a list of 3-mers to (num_word, 100) matrix"""
-    vec = np.zeros((len(sequence), 100))
     i = 0
-    for word in sequence:
-        vec[i,] = model.wv[word]
         i += 1
     return vec
 def sequence_to_token_ids(sequence, tokenizer):
     token_ids = tokenizer.encode(sequence)
-    return token_ids
 # def sequence_to_token_ids(sequence, tokenizer, max_length: int):
@@ -59,14 +62,14 @@ class SmilesTokenizer(BertTokenizer):
     Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
     implementation found in Huggingface's transformers library. It runs a WordPiece tokenization
-    algorithm over SMILES strings using the tokenisation SMILES regex developed by Schwaller et. al.
     Please see https://github.com/huggingface/transformers
     and https://github.com/rxn4chemistry/rxnfp for more details.
     Examples
     --------
-    >>> tokenizer = SmilesTokenizer(vocab_path)
     >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
     [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]
@@ -81,9 +84,10 @@ class SmilesTokenizer(BertTokenizer):
     ----
     This class requires huggingface's transformers and tokenizers libraries to be installed.
     """
     def __init__(
             self,
-            vocab_file: str = '',
             regex_pattern: str = SMI_REGEX_PATTERN,
             # unk_token="[UNK]",
             # sep_token="[SEP]",

 import numpy as np
 from transformers import BertTokenizer
+SMI_REGEX_PATTERN = r"""(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"""
+# \[[^\]]+\] # match anything inside square brackets
+# |Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p # match elements
+# |\(|\) # match parentheses
+# |\.|=|#|-|\+|\\|\/|:|~|@|\?|>>?|\*|\$|\%[0-9]{2} # match various symbols
+# |[0-9] # match digits
 def sequence_to_kmers(sequence, k=3):
 def sequence_to_word_embedding(sequence, model):
     """Get protein embedding, infer a list of 3-mers to (num_word, 100) matrix"""
+    kmers = sequence_to_kmers(sequence)
+    vec = np.zeros((len(kmers), 100))
     i = 0
+    for word in kmers:
+        try:
+            vec[i,] = model.wv[word]
+        except KeyError:
+            pass
         i += 1
     return vec
 def sequence_to_token_ids(sequence, tokenizer):
     token_ids = tokenizer.encode(sequence)
+    return np.array(token_ids)
 # def sequence_to_token_ids(sequence, tokenizer, max_length: int):
     Creates the SmilesTokenizer class. The tokenizer heavily inherits from the BertTokenizer
     implementation found in Huggingface's transformers library. It runs a WordPiece tokenization
+    algorithm over SMILES strings using the tokenization SMILES regex developed by Schwaller et al.
     Please see https://github.com/huggingface/transformers
     and https://github.com/rxn4chemistry/rxnfp for more details.
     Examples
     --------
+    >>> tokenizer = SmilesTokenizer(vocab_path, regex_pattern)
     >>> print(tokenizer.encode("CC(=O)OC1=CC=CC=C1C(=O)O"))
     [12, 16, 16, 17, 22, 19, 18, 19, 16, 20, 22, 16, 16, 22, 16, 16, 22, 16, 20, 16, 17, 22, 19, 18, 19, 13]
     ----
     This class requires huggingface's transformers and tokenizers libraries to be installed.
     """
     def __init__(
             self,
+            vocab_file: str = 'resources/vocabs/smiles.txt',
             regex_pattern: str = SMI_REGEX_PATTERN,
             # unk_token="[UNK]",
             # sep_token="[SEP]",

deepscreen/data/utils/__pycache__/collator.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/utils/__pycache__/collator.cpython-311.pyc and b/deepscreen/data/utils/__pycache__/collator.cpython-311.pyc differ

deepscreen/data/utils/__pycache__/label.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/utils/__pycache__/label.cpython-311.pyc and b/deepscreen/data/utils/__pycache__/label.cpython-311.pyc differ

deepscreen/data/utils/__pycache__/split.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/data/utils/__pycache__/split.cpython-311.pyc and b/deepscreen/data/utils/__pycache__/split.cpython-311.pyc differ

deepscreen/data/utils/collator.py CHANGED Viewed

@@ -72,46 +72,97 @@ def collate_fn(batch, automatic_padding=False, padding_value=0):
     return collate(batch, collate_fn_map=COLLATE_FN_MAP)
-class VariableLengthSequence(torch.Tensor):
-    """
-    A custom PyTorch Tensor class that is similar to PackedSequence, except it can be directly used as a batch tensor,
-    and it has an attribute called lengths, which signifies the length of each original sequence in the batch.
-    """
-    def __new__(cls, data, lengths):
-        """
-        Creates a new VariableLengthSequence object from the given data and lengths.
-        Args:
-            data (torch.Tensor): The batch collated tensor of shape (batch_size, max_length, *).
-            lengths (torch.Tensor): The lengths of each original sequence in the batch of shape (batch_size,).
-        Returns:
-            VariableLengthSequence: A new VariableLengthSequence object.
-        """
-        # Check the validity of the inputs
-        assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
-        assert isinstance(lengths, torch.Tensor), "lengths must be a torch.Tensor"
-        assert data.dim() >= 2, "data must have at least two dimensions"
-        assert lengths.dim() == 1, "lengths must have one dimension"
-        assert data.size(0) == lengths.size(0), "data and lengths must have the same batch size"
-        assert lengths.min() > 0, "lengths must be positive"
-        assert lengths.max() <= data.size(1), "lengths must not exceed the max length of data"
-        # Create a new tensor object from data
-        obj = super().__new__(cls, data)
-        # Set the lengths attribute
-        obj.lengths = lengths
-        return obj
-    def __repr__(self, *, tensor_contents=None):
-        """
-        Returns a string representation of the VariableLengthSequence object.
-        """
-        return f"VariableLengthSequence(data={self.data}, lengths={self.lengths})"
-    def __reduce_ex__(self, proto):
-        """
-        Enables pickling of the VariableLengthSequence object.
-        """
-        return type(self), (self.data, self.lengths)

     return collate(batch, collate_fn_map=COLLATE_FN_MAP)
+# class VariableLengthSequence(torch.Tensor):
+#     """
+#     A custom PyTorch Tensor class that is similar to PackedSequence, except it can be directly used as a batch tensor,
+#     and it has an attribute called lengths, which signifies the length of each original sequence in the batch.
+#     """
+#
+#     def __new__(cls, data, lengths):
+#         """
+#         Creates a new VariableLengthSequence object from the given data and lengths.
+#         Args:
+#             data (torch.Tensor): The batch collated tensor of shape (batch_size, max_length, *).
+#             lengths (torch.Tensor): The lengths of each original sequence in the batch of shape (batch_size,).
+#         Returns:
+#             VariableLengthSequence: A new VariableLengthSequence object.
+#         """
+#         # Check the validity of the inputs
+#         assert isinstance(data, torch.Tensor), "data must be a torch.Tensor"
+#         assert isinstance(lengths, torch.Tensor), "lengths must be a torch.Tensor"
+#         assert data.dim() >= 2, "data must have at least two dimensions"
+#         assert lengths.dim() == 1, "lengths must have one dimension"
+#         assert data.size(0) == lengths.size(0), "data and lengths must have the same batch size"
+#         assert lengths.min() > 0, "lengths must be positive"
+#         assert lengths.max() <= data.size(1), "lengths must not exceed the max length of data"
+#
+#         # Create a new tensor object from data
+#         obj = super().__new__(cls, data)
+#
+#         # Set the lengths attribute
+#         obj.lengths = lengths
+#
+#         return obj
+# class VariableLengthSequence(torch.Tensor):
+#     _lengths = torch.Tensor()
+#
+#     def __new__(cls, data, lengths, *args, **kwargs):
+#         self = super().__new__(cls, data, *args, **kwargs)
+#         self.lengths = lengths
+#         return self
+#
+#     def clone(self, *args, **kwargs):
+#         return VariableLengthSequence(super().clone(*args, **kwargs), self.lengths.clone())
+#
+#     def new_empty(self, *size):
+#         return VariableLengthSequence(super().new_empty(*size), self.lengths)
+#
+#     def to(self, *args, **kwargs):
+#         return VariableLengthSequence(super().to(*args, **kwargs), self.lengths.to(*args, **kwargs))
+#
+#     def __format__(self, format_spec):
+#         # Convert self to a string or a number here, depending on what you need
+#         return self.item().__format__(format_spec)
+#
+#     @property
+#     def lengths(self):
+#         return self._lengths
+#
+#     @lengths.setter
+#     def lengths(self, lengths):
+#         self._lengths = lengths
+#
+#     def cpu(self, *args, **kwargs):
+#         return VariableLengthSequence(super().cpu(*args, **kwargs), self.lengths.cpu(*args, **kwargs))
+#
+#     def cuda(self, *args, **kwargs):
+#         return VariableLengthSequence(super().cuda(*args, **kwargs), self.lengths.cuda(*args, **kwargs))
+#
+#     def pin_memory(self):
+#         return VariableLengthSequence(super().pin_memory(), self.lengths.pin_memory())
+#
+#     def share_memory_(self):
+#         super().share_memory_()
+#         self.lengths.share_memory_()
+#         return self
+#
+#     def detach_(self, *args, **kwargs):
+#         super().detach_(*args, **kwargs)
+#         self.lengths.detach_(*args, **kwargs)
+#         return self
+#
+#     def detach(self, *args, **kwargs):
+#         return VariableLengthSequence(super().detach(*args, **kwargs), self.lengths.detach(*args, **kwargs))
+#
+#     def record_stream(self, *args, **kwargs):
+#         super().record_stream(*args, **kwargs)
+#         self.lengths.record_stream(*args, **kwargs)
+#         return self
+    # @classmethod
+    # def __torch_function__(cls, func, types, args=(), kwargs=None):
+    #     return super().__torch_function__(func, types, args, kwargs) \
+    #         if cls.lengths is not None else torch.Tensor.__torch_function__(func, types, args, kwargs)

deepscreen/data/utils/label.py CHANGED Viewed

@@ -19,6 +19,7 @@ MOLARITY_TO_POTENCY = {
 }
 def molar_to_p(labels, units):
     assert units in MOLARITY_TO_POTENCY, f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}."

 }
+# TODO rewrite for swifter.apply
 def molar_to_p(labels, units):
     assert units in MOLARITY_TO_POTENCY, f"Allowed units: {', '.join(MOLARITY_TO_POTENCY)}."

deepscreen/gui/test.py ADDED Viewed

	@@ -0,0 +1,114 @@

+from pathlib import Path
+import gradio as gr
# Use this in a notebook
root = Path.cwd()
_model_config_dir = root.parent.joinpath("configs/model")


def _list_yaml_stems(component):
    """Return the stems of the ``.yaml`` files directly inside ``configs/model/<component>``."""
    component_dir = _model_config_dir.joinpath(component)
    return [entry.stem for entry in component_dir.iterdir() if entry.suffix == ".yaml"]


# Available config choices, one list per model component directory
drug_encoder_list = _list_yaml_stems("drug_encoder")
drug_featurizer_list = _list_yaml_stems("drug_featurizer")
protein_encoder_list = _list_yaml_stems("protein_encoder")
protein_featurizer_list = _list_yaml_stems("protein_featurizer")
classifier_list = _list_yaml_stems("classifier")
preset_list = _list_yaml_stems("preset")
+from typing import Optional
def drug_target_interaction(
        binary: bool,
        drug_encoder,
        drug_featurizer,
        protein_encoder,
        protein_featurizer,
        classifier,
        preset,
) -> Optional[float]:
    """Placeholder callback: accepts the model selection arguments and always returns ``1``."""
    placeholder_result = 1
    return placeholder_result
def drug_encoder(
        binary: bool,
        drug_encoder,
        drug_featurizer,
        protein_encoder,
        protein_featurizer,
        classifier,
        preset,
):
    """Placeholder callback: accepts the model selection arguments and returns ``None``."""
    return None
def protein_encoder(
        binary: bool,
        drug_encoder,
        drug_featurizer,
        protein_encoder,
        protein_featurizer,
        classifier,
        preset,
):
    """Placeholder callback: accepts the model selection arguments and returns ``None``."""
    return None
+# demo = gr.Interface(
+#     fn=drug_target_interaction,
+#     inputs=[
+#         gr.Radio(["True", "False"]),
+#         gr.Dropdown(drug_encoder_list),
+#         gr.Dropdown(drug_featurizer_list),
+#         gr.Dropdown(protein_encoder_list),
+#         gr.Dropdown(protein_featurizer_list),
+#         gr.Dropdown(classifier_list),
+#         gr.Dropdown(preset_list),
+#     ],
+#     outputs=["number"],
+#     show_error=True,
+#
+# )
+#
+# demo.launch()
+from omegaconf import DictConfig, OmegaConf
+type_to_component_map = {list: gr.Text, int: gr.Number, float: gr.Number}
def get_config_choices(config_path: str):
    """List the stems of the ``.yaml`` files found directly in ``../../configs/<config_path>``."""
    config_dir = Path("../../configs/", config_path)
    choices = []
    for entry in config_dir.iterdir():
        if entry.suffix == ".yaml":
            choices.append(entry.stem)
    return choices
def create_blocks_from_config(cfg: DictConfig):
    """Build a ``gr.Blocks`` layout with one component per entry of ``cfg``.

    Args:
        cfg: Mapping of config keys to values. Nested ``dict``/``DictConfig``
            values are rendered recursively.

    Returns:
        The ``gr.Blocks`` container inside which the components were created:
        ``gr.Number`` for ``int``/``float`` values, a ``gr.Tab`` holding a nested
        layout for ``dict``/``DictConfig`` values, and ``gr.Text`` for anything else.
    """
    with gr.Blocks() as blocks:
        for key, value in cfg.items():
            # One exclusive chain so that every entry yields exactly one component;
            # with independent `if`s, numeric values would also reach the final
            # `else` and get a text box in addition to their number field.
            # Exact type matches are kept, so e.g. `bool` is rendered as text.
            if type(value) in [int, float]:
                gr.Number(value=value, label=key, interactive=True)
            elif type(value) in [dict, DictConfig]:
                with gr.Tab(label=key):
                    create_blocks_from_config(value)
            else:
                gr.Text(value=value, label=key, interactive=True)
    return blocks
def create_interface_from_config(fn: callable, cfg: DictConfig):
    """Wrap ``fn`` in a ``gr.Interface`` with one input component per top-level entry of ``cfg``.

    The component class for each entry is looked up in ``type_to_component_map``
    by the exact type of its value, defaulting to ``gr.Text``.
    """
    plain_cfg = OmegaConf.to_object(cfg)
    inputs = [
        type_to_component_map.get(type(value), gr.Text)(value=value, label=key, interactive=True)
        for key, value in plain_cfg.items()
    ]
    return gr.Interface(fn=fn, inputs=inputs, outputs="label")
+import hydra
+with hydra.initialize(version_base=None, config_path="../../configs/"):
+    cfg = hydra.compose("train")

deepscreen/models/__pycache__/dti.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/models/__pycache__/dti.cpython-311.pyc and b/deepscreen/models/__pycache__/dti.cpython-311.pyc differ

deepscreen/models/dti.py CHANGED Viewed

@@ -66,7 +66,7 @@ class DTILightningModule(LightningModule):
     def forward(self, batch):
         output = self.predictor(batch['X1^'], batch['X2^'])
         target = batch.get('Y')
-        indexes = batch.get('IDX')
         preds = None
         loss = None

     def forward(self, batch):
         output = self.predictor(batch['X1^'], batch['X2^'])
         target = batch.get('Y')
+        indexes = batch.get('ID^')
         preds = None
         loss = None

deepscreen/models/loss/__pycache__/multitask_loss.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/models/loss/__pycache__/multitask_loss.cpython-311.pyc and b/deepscreen/models/loss/__pycache__/multitask_loss.cpython-311.pyc differ

deepscreen/models/metrics/bedroc.py CHANGED Viewed

@@ -40,3 +40,6 @@ class BEDROC(RetrievalMetric):
         rie_max = (1 - exp_a ** (-r_a)) / (r_a * (1 - exp_a ** (-1)))
         return (rie - rie_min) / (rie_max - rie_min)

         rie_max = (1 - exp_a ** (-r_a)) / (r_a * (1 - exp_a ** (-1)))
         return (rie - rie_min) / (rie_max - rie_min)
+    def plot(self, val=None, ax=None):
+        return self._plot(val, ax)

deepscreen/models/metrics/ci.py ADDED Viewed

	@@ -0,0 +1,39 @@

+import torch
+from torchmetrics import Metric
+from torchmetrics.utilities.checks import _check_same_shape
+from torchmetrics.utilities.imports import _MATPLOTLIB_AVAILABLE
+if not _MATPLOTLIB_AVAILABLE:
+    __doctest_skip__ = ["ConcordanceIndex.plot"]
class ConcordanceIndex(Metric):
    """Concordance index accumulated over the pairs seen in each ``update`` call.

    A pair ``(i, j)`` is *valid* when ``target[i] > target[j]`` and ``j <= i``
    (lower triangle of the pairwise comparison matrix). A valid pair scores 1
    when ``preds[i] > preds[j]``, 0.5 when the two predictions are equal, and 0
    otherwise. The metric is the total score divided by the number of valid
    pairs. Pairs are only formed within the tensors passed to one ``update``
    call, never across calls.
    """
    is_differentiable: bool = False
    higher_is_better: bool = True
    full_state_update: bool = False
    plot_lower_bound: float = 0.5
    plot_upper_bound: float = 1.0

    def __init__(self, dist_sync_on_step=False):
        super().__init__(dist_sync_on_step=dist_sync_on_step)
        # All states are integer counters so that accumulation stays exact;
        # tied pairs are counted separately and weighted by 0.5 in `compute`.
        self.add_state("num_concordant", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("num_tied", default=torch.tensor(0), dist_reduce_fx="sum")
        self.add_state("num_valid", default=torch.tensor(0), dist_reduce_fx="sum")

    def update(self, preds: torch.Tensor, target: torch.Tensor):
        """Accumulate pair counts from one batch of predictions and targets.

        NOTE(review): the pairwise broadcasting below assumes 1-D ``preds`` and
        ``target`` of equal length -- confirm against callers.
        """
        _check_same_shape(preds, target)

        # pred_diff[i, j] = preds[i] - preds[j]
        pred_diff = preds.unsqueeze(-1) - preds
        # valid[i, j] is True when target[i] > target[j], restricted to j <= i
        valid = (target.unsqueeze(-1) - target) > 0
        valid = torch.tril(valid, diagonal=0)

        # Summing boolean masks yields integer counts, so half-credits for ties
        # are no longer lost to per-batch truncation.
        self.num_concordant += torch.sum((pred_diff > 0) & valid)
        self.num_tied += torch.sum((pred_diff == 0) & valid)
        self.num_valid += torch.sum(valid)

    def compute(self):
        """Return the concordance index, or 0.0 when no valid pair has been seen."""
        score = self.num_concordant + 0.5 * self.num_tied
        return torch.where(self.num_valid == 0, 0.0, score / self.num_valid)

    def plot(self, val=None, ax=None):
        """Delegate plotting to the inherited ``_plot`` helper."""
        return self._plot(val, ax)

deepscreen/models/metrics/ef.py CHANGED Viewed

@@ -5,7 +5,7 @@ from torchmetrics.retrieval.base import RetrievalMetric
 from torchmetrics.utilities.checks import _check_retrieval_functional_inputs
-class EF(RetrievalMetric):
     is_differentiable: bool = False
     higher_is_better: bool = True
     full_state_update: bool = False
@@ -29,3 +29,6 @@ class EF(RetrievalMetric):
         hits_total = target.sum()
         return hits_sampled / (hits_total * self.alpha)

 from torchmetrics.utilities.checks import _check_retrieval_functional_inputs
+class EnrichmentFactor(RetrievalMetric):
     is_differentiable: bool = False
     higher_is_better: bool = True
     full_state_update: bool = False
         hits_total = target.sum()
         return hits_sampled / (hits_total * self.alpha)
+    def plot(self, val=None, ax=None):
+        """Forward ``val`` and ``ax`` to ``self._plot`` and return whatever it returns."""
+        plotted = self._plot(val, ax)
+        return plotted

deepscreen/models/metrics/hit_rate.py CHANGED Viewed

@@ -31,3 +31,6 @@ class HitRate(RetrievalMetric):
         hits_sampled = target[idx].sum()
         return hits_sampled / n_sampled

         hits_sampled = target[idx].sum()
         return hits_sampled / n_sampled
+    def plot(self, val=None, ax=None):
+        """Forward ``val`` and ``ax`` to ``self._plot`` and return whatever it returns."""
+        plotted = self._plot(val, ax)
+        return plotted

deepscreen/models/metrics/rie.py CHANGED Viewed

@@ -4,6 +4,13 @@ from torchmetrics.retrieval.base import RetrievalMetric
 from torchmetrics.utilities.checks import _check_retrieval_functional_inputs
 class RIE(RetrievalMetric):
     is_differentiable: bool = False
     higher_is_better: bool = True
@@ -33,9 +40,5 @@ class RIE(RetrievalMetric):
         return calc_rie(n_total, active_ranks, r_a, exp_a)
-def calc_rie(n_total, active_ranks, r_a, exp_a):
-    numerator = (exp_a ** (- active_ranks / n_total)).sum()
-    denominator = (1 - exp_a ** (-1)) / (exp_a ** (1 / n_total) - 1)
-    return numerator / (r_a * denominator)

 from torchmetrics.utilities.checks import _check_retrieval_functional_inputs
+def calc_rie(n_total, active_ranks, r_a, exp_a):
+    """Return the RIE value for the given ranks of the actives.
+    The summed term ``exp_a ** (-rank / n_total)`` over ``active_ranks`` is divided by
+    ``r_a`` times a normalising factor that depends only on ``exp_a`` and ``n_total``.
+    ``active_ranks`` must support element-wise arithmetic and ``.sum()``.
+    NOTE(review): ``exp_a`` is presumably ``exp(alpha)`` and ``r_a`` the fraction of
+    actives - confirm against the calling metric.
+    """
+    weighted_actives = exp_a ** (- active_ranks / n_total)
+    normaliser = (1 - exp_a ** (-1)) / (exp_a ** (1 / n_total) - 1)
+    scaled_normaliser = r_a * normaliser
+    return weighted_actives.sum() / scaled_normaliser
 class RIE(RetrievalMetric):
     is_differentiable: bool = False
     higher_is_better: bool = True
         return calc_rie(n_total, active_ranks, r_a, exp_a)
+    def plot(self, val=None, ax=None):
+        """Forward ``val`` and ``ax`` to ``self._plot`` and return whatever it returns."""
+        plotted = self._plot(val, ax)
+        return plotted

deepscreen/models/predictors/drug_vqa.py CHANGED Viewed

@@ -1,10 +1,11 @@
 from math import floor
 from typing import Literal
 import torch.nn as nn
 import torch
 import torch.nn.functional as F
-# from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence
 def conv(in_channels, out_channels, kernel_size, conv_dim, stride=1):
@@ -170,6 +171,8 @@ class DrugVQA(nn.Module):
         return nn.Sequential(*layers)
     def forward(self, enc_drug, enc_protein):
         smile_embed = self.embeddings(enc_drug.long())
         # self.hidden_state = tuple(hidden_state.to(smile_embed).detach() for hidden_state in self.hidden_state)
         outputs, hidden_state = self.lstm(smile_embed)

 from math import floor
+import re
 from typing import Literal
+import numpy as np
 import torch.nn as nn
 import torch
 import torch.nn.functional as F
 def conv(in_channels, out_channels, kernel_size, conv_dim, stride=1):
         return nn.Sequential(*layers)
     def forward(self, enc_drug, enc_protein):
+        enc_drug, _ = enc_drug
+        enc_protein, _ = enc_protein
         smile_embed = self.embeddings(enc_drug.long())
         # self.hidden_state = tuple(hidden_state.to(smile_embed).detach() for hidden_state in self.hidden_state)
         outputs, hidden_state = self.lstm(smile_embed)

deepscreen/models/predictors/transformer_cpi.py CHANGED Viewed

@@ -9,8 +9,7 @@ class TransformerCPI(nn.Module):
         super().__init__()
         self.encoder = Encoder(protein_dim, hidden_dim, n_layers, kernel_size, dropout)
-        self.decoder = Decoder(atom_dim, hidden_dim, n_layers, n_heads, pf_dim, DecoderLayer, SelfAttention,
-                               PositionwiseFeedforward, dropout)
         self.weight = nn.Parameter(torch.FloatTensor(atom_dim, atom_dim))
         self.init_weight()
@@ -23,18 +22,24 @@ class TransformerCPI(nn.Module):
         # adj = [batch,num_node, num_node]
         support = torch.matmul(input, self.weight)
         # support =[batch,num_node,atom_dim]
-        output = torch.bmm(adj, support)
         # output = [batch,num_node,atom_dim]
         return output
-    def forward(self, compound, adj, protein, atom_num, protein_num):
         # compound = [batch,atom_num, atom_dim]
         # adj = [batch,atom_num, atom_num]
         # protein = [batch,protein len, 100]
-        compound_max_len = compound.shape[1]
-        protein_max_len = protein.shape[1]
-        compound_mask, protein_mask = self.make_masks(atom_num, protein_num, compound_max_len, protein_max_len)
-        compound = self.gcn(compound, adj)
         # compound = torch.unsqueeze(compound, dim=0)
         # compound = [batch size=1 ,atom_num, atom_dim]
@@ -48,54 +53,6 @@ class TransformerCPI(nn.Module):
         # out = torch.squeeze(out, dim=0)
         return out
-    @staticmethod
-    def make_masks(atom_num, protein_num, compound_max_len, protein_max_len):
-        n_atom = len(atom_num)  # batch size
-        compound_mask = torch.zeros((n_atom, compound_max_len))
-        protein_mask = torch.zeros((n_atom, protein_max_len))
-        for i in range(n_atom):
-            compound_mask[i, :atom_num[i]] = 1
-            protein_mask[i, :protein_num[i]] = 1
-        compound_mask = compound_mask.unsqueeze(1).unsqueeze(3)
-        protein_mask = protein_mask.unsqueeze(1).unsqueeze(2)
-        return compound_mask, protein_mask
-    @staticmethod
-    def pack(atoms, adjs, proteins, labels):
-        atoms_len = 0
-        proteins_len = 0
-        N = len(atoms)
-        atom_num = []
-        for atom in atoms:
-            atom_num.append(atom.shape[0])
-            if atom.shape[0] >= atoms_len:
-                atoms_len = atom.shape[0]
-        protein_num = []
-        for protein in proteins:
-            protein_num.append(protein.shape[0])
-            if protein.shape[0] >= proteins_len:
-                proteins_len = protein.shape[0]
-        atoms_new = torch.zeros((N, atoms_len, 34))
-        for i, atom in enumerate(atoms):
-            a_len = atom.shape[0]
-            atoms_new[i, :a_len, :] = atom
-        adjs_new = torch.zeros((N, atoms_len, atoms_len))
-        for i, adj in adjs:
-            a_len = adj.shape[0]
-            adj = adj + torch.eye(a_len)
-            adjs_new[i, :a_len, :a_len] = adj
-        proteins_new = torch.zeros((N, proteins_len, 100))
-        for i, protein in enumerate(proteins):
-            a_len = protein.shape[0]
-            proteins_new[i, :a_len, :] = protein
-        return atoms_new, adjs_new, proteins_new, atom_num, protein_num
 class SelfAttention(nn.Module):
     def __init__(self, hidden_dim, n_heads, dropout):
@@ -114,7 +71,7 @@ class SelfAttention(nn.Module):
         self.do = nn.Dropout(dropout)
-        self.scale = torch.sqrt(torch.FloatTensor([hidden_dim // n_heads]))
     def forward(self, query, key, value, mask=None):
         bsz = query.shape[0]
@@ -164,7 +121,6 @@ class SelfAttention(nn.Module):
 class Encoder(nn.Module):
     """protein feature extraction."""
     def __init__(self, protein_dim, hidden_dim, n_layers, kernel_size, dropout):
         super().__init__()
@@ -176,7 +132,7 @@ class Encoder(nn.Module):
         self.dropout = dropout
         self.n_layers = n_layers
         # self.pos_embedding = nn.Embedding(1000, hidden_dim)
-        self.scale = torch.sqrt(torch.FloatTensor([0.5]))
         self.convs = nn.ModuleList(
             [nn.Conv1d(hidden_dim, 2 * hidden_dim, kernel_size, padding=(kernel_size - 1) // 2) for _ in
              range(self.n_layers)])  # convolutional layers
@@ -189,7 +145,7 @@ class Encoder(nn.Module):
         # pos = torch.arange(0, protein.shape[1]).unsqueeze(0).repeat(protein.shape[0], 1)
         # protein = protein + self.pos_embedding(pos)
         # protein = [batch size, protein len,protein_dim]
-        conv_input = self.fc(protein)
         # conv_input=[batch size,protein len,hid dim]
         # permute for convolutional layer
         conv_input = conv_input.permute(0, 2, 1)
@@ -239,7 +195,9 @@ class PositionwiseFeedforward(nn.Module):
 class DecoderLayer(nn.Module):
-    def __init__(self, hidden_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout):
         super().__init__()
         self.ln = nn.LayerNorm(hidden_dim)
         self.sa = self_attention(hidden_dim, n_heads, dropout)
@@ -262,8 +220,10 @@ class DecoderLayer(nn.Module):
 class Decoder(nn.Module):
     """ compound feature extraction."""
-    def __init__(self, atom_dim, hidden_dim, n_layers, n_heads, pf_dim, decoder_layer, self_attention,
-                 positionwise_feedforward, dropout):
         super().__init__()
         self.ln = nn.LayerNorm(hidden_dim)
         self.output_dim = atom_dim
@@ -277,12 +237,12 @@ class Decoder(nn.Module):
         self.dropout = dropout
         self.sa = self_attention(hidden_dim, n_heads, dropout)
         self.layers = nn.ModuleList(
-            [decoder_layer(hidden_dim, n_heads, pf_dim, self_attention, positionwise_feedforward, dropout)
              for _ in range(n_layers)])
         self.ft = nn.Linear(atom_dim, hidden_dim)
         self.do = nn.Dropout(dropout)
         self.fc_1 = nn.Linear(hidden_dim, 256)
-        self.fc_2 = nn.Linear(256, 2)
         self.gn = nn.GroupNorm(8, 256)
     def forward(self, trg, src, trg_mask=None, src_mask=None):
@@ -297,7 +257,7 @@ class Decoder(nn.Module):
         norm = F.softmax(norm, dim=1)  # norm = [batch size,compound len]
         # trg = torch.squeeze(trg,dim=0)
         # norm = torch.squeeze(norm,dim=0)
-        sum = torch.zeros((trg.shape[0], self.hidden_dim))
         for i in range(norm.shape[0]):
             for j in range(norm.shape[1]):
                 v = trg[i, j,]

         super().__init__()
         self.encoder = Encoder(protein_dim, hidden_dim, n_layers, kernel_size, dropout)
+        self.decoder = Decoder(atom_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout)
         self.weight = nn.Parameter(torch.FloatTensor(atom_dim, atom_dim))
         self.init_weight()
         # adj = [batch,num_node, num_node]
         support = torch.matmul(input, self.weight)
         # support =[batch,num_node,atom_dim]
+        output = torch.bmm(adj.float(), support.float())
         # output = [batch,num_node,atom_dim]
         return output
+    def forward(self, compound, protein):
+        compound, adj = compound
+        compound, compound_lengths = compound
+        adj, _ = adj
+        protein, protein_lengths = protein
         # compound = [batch,atom_num, atom_dim]
         # adj = [batch,atom_num, atom_num]
         # protein = [batch,protein len, 100]
+        compound_mask = torch.arange(compound.size(1), device=compound.device) >= compound_lengths.unsqueeze(1)
+        protein_mask = torch.arange(protein.size(1), device=protein.device) >= protein_lengths.unsqueeze(1)
+        compound_mask = compound_mask.unsqueeze(1).unsqueeze(3)
+        protein_mask = protein_mask.unsqueeze(1).unsqueeze(2)
+        compound = self.gcn(compound.float(), adj)
         # compound = torch.unsqueeze(compound, dim=0)
         # compound = [batch size=1 ,atom_num, atom_dim]
         # out = torch.squeeze(out, dim=0)
         return out
 class SelfAttention(nn.Module):
     def __init__(self, hidden_dim, n_heads, dropout):
         self.do = nn.Dropout(dropout)
+        self.scale = (hidden_dim // n_heads) ** 0.5
     def forward(self, query, key, value, mask=None):
         bsz = query.shape[0]
 class Encoder(nn.Module):
     """protein feature extraction."""
     def __init__(self, protein_dim, hidden_dim, n_layers, kernel_size, dropout):
         super().__init__()
         self.dropout = dropout
         self.n_layers = n_layers
         # self.pos_embedding = nn.Embedding(1000, hidden_dim)
+        self.scale = 0.5 ** 0.5
         self.convs = nn.ModuleList(
             [nn.Conv1d(hidden_dim, 2 * hidden_dim, kernel_size, padding=(kernel_size - 1) // 2) for _ in
              range(self.n_layers)])  # convolutional layers
         # pos = torch.arange(0, protein.shape[1]).unsqueeze(0).repeat(protein.shape[0], 1)
         # protein = protein + self.pos_embedding(pos)
         # protein = [batch size, protein len,protein_dim]
+        conv_input = self.fc(protein.float())
         # conv_input=[batch size,protein len,hid dim]
         # permute for convolutional layer
         conv_input = conv_input.permute(0, 2, 1)
 class DecoderLayer(nn.Module):
+    def __init__(self, hidden_dim, n_heads, pf_dim, dropout,
+                 self_attention=SelfAttention,
+                 positionwise_feedforward=PositionwiseFeedforward):
         super().__init__()
         self.ln = nn.LayerNorm(hidden_dim)
         self.sa = self_attention(hidden_dim, n_heads, dropout)
 class Decoder(nn.Module):
     """ compound feature extraction."""
+    def __init__(self, atom_dim, hidden_dim, n_layers, n_heads, pf_dim, dropout,
+                 decoder_layer=DecoderLayer,
+                 self_attention=SelfAttention,
+                 positionwise_feedforward=PositionwiseFeedforward):
         super().__init__()
         self.ln = nn.LayerNorm(hidden_dim)
         self.output_dim = atom_dim
         self.dropout = dropout
         self.sa = self_attention(hidden_dim, n_heads, dropout)
         self.layers = nn.ModuleList(
+            [decoder_layer(hidden_dim, n_heads, pf_dim, dropout, self_attention, positionwise_feedforward)
              for _ in range(n_layers)])
         self.ft = nn.Linear(atom_dim, hidden_dim)
         self.do = nn.Dropout(dropout)
         self.fc_1 = nn.Linear(hidden_dim, 256)
+        # self.fc_2 = nn.Linear(256, 2)
         self.gn = nn.GroupNorm(8, 256)
     def forward(self, trg, src, trg_mask=None, src_mask=None):
         norm = F.softmax(norm, dim=1)  # norm = [batch size,compound len]
         # trg = torch.squeeze(trg,dim=0)
         # norm = torch.squeeze(norm,dim=0)
+        sum = torch.zeros((trg.shape[0], self.hidden_dim), device=trg.device)
         for i in range(norm.shape[0]):
             for j in range(norm.shape[1]):
                 v = trg[i, j,]

deepscreen/models/predictors/transformer_cpi_2.py CHANGED Viewed

@@ -23,9 +23,8 @@ class TransformerCPI2(nn.Module):
         # adj_mat = [batch_size, atom_num, atom_num]
         # enc_protein = [batch_size, protein_len, 768]
         compound, adj = compound
         compound, compound_lengths = compound
-        adj, adj_lengths = adj
         protein, protein_lengths = protein
         # Add a global/master node to the compound
@@ -99,5 +98,5 @@ class Decoder(nn.Module):
         tgt = tgt.permute(1, 0, 2).contiguous()  # tgt = [batch_size, compound_len, hid_dim]
         x = tgt[:, 0, :]
         label = F.relu(self.fc_1(x))
-        label = self.fc_2(label)
         return label

         # adj_mat = [batch_size, atom_num, atom_num]
         # enc_protein = [batch_size, protein_len, 768]
         compound, adj = compound
+        adj, _ = adj
         compound, compound_lengths = compound
         protein, protein_lengths = protein
         # Add a global/master node to the compound
         tgt = tgt.permute(1, 0, 2).contiguous()  # tgt = [batch_size, compound_len, hid_dim]
         x = tgt[:, 0, :]
         label = F.relu(self.fc_1(x))
+        # label = self.fc_2(label)
         return label

deepscreen/utils/__pycache__/hydra.cpython-311.pyc CHANGED Viewed

Binary files a/deepscreen/utils/__pycache__/hydra.cpython-311.pyc and b/deepscreen/utils/__pycache__/hydra.cpython-311.pyc differ

deepscreen/utils/hydra.py CHANGED Viewed

@@ -1,8 +1,11 @@
 from pathlib import Path
 import re
 from typing import Any, Tuple
 import pandas as pd
 from hydra.core.hydra_config import HydraConfig
 from hydra.core.utils import _save_config
 from hydra.experimental.callbacks import Callback
@@ -21,21 +24,24 @@ class CSVExperimentSummary(Callback):
         self.filename = filename
         self.prefix = prefix if isinstance(prefix, str) else tuple(prefix)
         self.input_experiment_summary = None
     def on_multirun_start(self, config: DictConfig, **kwargs: Any) -> None:
-        if config.hydra.get('overrides'):
-            if config.hydra.overrides.task:
-                for i, override in enumerate(config.hydra.overrides.task):
-                    if override.startswith("ckpt_path"):
-                        ckpt_path = override.split('=', 1)[1]
-                        if ckpt_path.endswith(('.csv', '.txt', '.tsv', '.ssv', '.psv')):
-                            config.hydra.overrides.task[i] = self.parse_ckpt_path_from_experiment_summary(ckpt_path)
-                        break
-        elif config.hydra.sweeper.get('params'):
-            if config.hydra.sweeper.params.get('ckpt_path'):
-                ckpt_path = str(config.hydra.sweeper.params.ckpt_path).strip("'\"")
-                if ckpt_path.endswith(('.csv', '.txt', '.tsv', '.ssv', '.psv')):
-                    config.hydra.sweeper.params.ckpt_path = self.parse_ckpt_path_from_experiment_summary(ckpt_path)
     def on_job_end(self, config: DictConfig, job_return, **kwargs: Any) -> None:
         # Skip callback if job is DDP subprocess
@@ -43,6 +49,7 @@ class CSVExperimentSummary(Callback):
             return
         try:
             if config.hydra.mode == RunMode.RUN:
                 summary_file_path = Path(config.hydra.run.dir) / self.filename
             elif config.hydra.mode == RunMode.MULTIRUN:
@@ -56,21 +63,23 @@ class CSVExperimentSummary(Callback):
                 summary_df = pd.DataFrame()
             # Add job and override info
-            override_dict = dict(override.split('=', 1) for override in job_return.overrides)
-            override_dict['job_status'] = job_return.status.name
             # Add checkpoint info
-            if override_dict.get('ckpt_path'):
-                override_dict['ckpt_path'] = str(override_dict['ckpt_path']).strip("'\"")
-            if job_return.cfg.get('ckpt_path'):
-                ckpt_path = str(job_return.cfg.ckpt_path).strip("'\"")
-                if Path(ckpt_path).is_file():
-                    if override_dict.get('ckpt_path') and ckpt_path != override_dict['ckpt_path']:
-                        override_dict['previous_ckpt_path'] = override_dict['ckpt_path']
-                    override_dict['ckpt_path'] = ckpt_path
-            override_dict['epoch'] = int(re.search(r'epoch_(\d+)', override_dict['ckpt_path']).group(1))
             # Add metrics info
             metrics_df = pd.DataFrame()
@@ -79,22 +88,22 @@ class CSVExperimentSummary(Callback):
                 csv_metrics_path = output_dir / config.logger.csv.name / "metrics.csv"
                 if csv_metrics_path.is_file():
                     log.info(f"Summarizing metrics with prefix `{self.prefix}` from {csv_metrics_path}")
-                    # Use only columns that start with the specified prefix
                     metrics_df = pd.read_csv(csv_metrics_path)
-                    # Find rows where any 'test/' column is not null and reset its epoch to the best model epoch
                     test_columns = [col for col in metrics_df.columns if col.startswith('test/')]
-                    mask = metrics_df[test_columns].notna().any(axis=1)
-                    metrics_df.loc[mask, 'epoch'] = override_dict['epoch']
                     # Group and filter by best epoch
                     metrics_df = metrics_df.groupby('epoch').first()
-                    metrics_df = metrics_df[metrics_df.index == override_dict['epoch']]
                 else:
                     log.info(f"No metrics.csv found in {output_dir}")
             if metrics_df.empty:
-                metrics_df = pd.DataFrame(data=override_dict, index=[0])
             else:
-                metrics_df = metrics_df.assign(**override_dict)
                 metrics_df.index = [0]
             # Add extra info from the input batch experiment summary
@@ -102,7 +111,8 @@ class CSVExperimentSummary(Callback):
                 orig_meta = self.input_experiment_summary[
                     self.input_experiment_summary['ckpt_path'] == metrics_df['ckpt_path'][0]
                     ].head(1)
-                orig_meta.index = [0]
                 metrics_df = metrics_df.combine_first(orig_meta)
             summary_df = pd.concat([summary_df, metrics_df])
@@ -169,9 +179,8 @@ def checkpoint_rerun_config(config: DictConfig):
             ckpt_cfg.data = OmegaConf.masked_copy(ckpt_cfg.data, [
                 key for key in ckpt_cfg.data.keys() if key not in ['data_file', 'split', 'train_val_test_split']
             ])
-            ckpt_override_keys = ['task',
-                                  'data.drug_featurizer', 'data.protein_featurizer', 'data.collator',
-                                  'model.predictor']
             for key in ckpt_override_keys:
                 OmegaConf.update(config, key, OmegaConf.select(ckpt_cfg, key), force_add=True)
@@ -183,3 +192,4 @@ def checkpoint_rerun_config(config: DictConfig):
             _save_config(config, "config.yaml", hydra_output)
     return config

+from datetime import timedelta
 from pathlib import Path
 import re
+from time import time
 from typing import Any, Tuple
 import pandas as pd
+from hydra import TaskFunction
 from hydra.core.hydra_config import HydraConfig
 from hydra.core.utils import _save_config
 from hydra.experimental.callbacks import Callback
         self.filename = filename
         self.prefix = prefix if isinstance(prefix, str) else tuple(prefix)
         self.input_experiment_summary = None
+        self.time = {}
     def on_multirun_start(self, config: DictConfig, **kwargs: Any) -> None:
+        """Replace ``ckpt_path`` values that point to a summary file before the multirun starts.
+        A ``ckpt_path`` ending in ``.csv``, ``.txt``, ``.tsv``, ``.ssv`` or ``.psv`` is replaced by
+        the value returned from ``self.parse_ckpt_path_from_experiment_summary``. Both the task
+        overrides and ``hydra.sweeper.params`` are inspected.
+        """
+        summary_suffixes = ('.csv', '.txt', '.tsv', '.ssv', '.psv')
+        if config.hydra.get('overrides') and config.hydra.overrides.get('task'):
+            for i, override in enumerate(config.hydra.overrides.task):
+                if override.startswith("ckpt_path"):
+                    ckpt_path = override.split('=', 1)[1]
+                    if ckpt_path.endswith(summary_suffixes):
+                        config.hydra.overrides.task[i] = self.parse_ckpt_path_from_experiment_summary(ckpt_path)
+                    # Only the first `ckpt_path` override is considered.
+                    break
+        # Checked independently of the task overrides, so a summary file given only through
+        # `hydra.sweeper.params` is still handled when no task override was passed.
+        if config.hydra.sweeper.get('params') and config.hydra.sweeper.params.get('ckpt_path'):
+            ckpt_path = str(config.hydra.sweeper.params.ckpt_path).strip("'\"")
+            if ckpt_path.endswith(summary_suffixes):
+                config.hydra.sweeper.params.ckpt_path = self.parse_ckpt_path_from_experiment_summary(ckpt_path)
+    def on_job_start(self, config: DictConfig, *, task_function: TaskFunction, **kwargs: Any) -> None:
+        """Store the current ``time()`` value in ``self.time['start']``."""
+        started_at = time()
+        self.time['start'] = started_at
     def on_job_end(self, config: DictConfig, job_return, **kwargs: Any) -> None:
         # Skip callback if job is DDP subprocess
             return
         try:
+            self.time['end'] = time()
             if config.hydra.mode == RunMode.RUN:
                 summary_file_path = Path(config.hydra.run.dir) / self.filename
             elif config.hydra.mode == RunMode.MULTIRUN:
                 summary_df = pd.DataFrame()
             # Add job and override info
+            info_dict = {}
+            if job_return.overrides:
+                info_dict = dict(override.split('=', 1) for override in job_return.overrides)
+            info_dict['job_status'] = job_return.status.name
+            info_dict['job_id'] = job_return.hydra_cfg.hydra.job.id
+            # time() values differ in seconds, while timedelta's first positional argument is
+            # days, so the unit is passed explicitly.
+            info_dict['wall_time'] = str(timedelta(seconds=self.time['end'] - self.time['start']))
             # Add checkpoint info
+            if info_dict.get('ckpt_path'):
+                info_dict['ckpt_path'] = str(info_dict['ckpt_path']).strip("'\"")
+            ckpt_path = str(job_return.cfg.ckpt_path).strip("'\"")
+            if Path(ckpt_path).is_file():
+                if info_dict.get('ckpt_path') and ckpt_path != info_dict['ckpt_path']:
+                    info_dict['previous_ckpt_path'] = info_dict['ckpt_path']
+                info_dict['ckpt_path'] = ckpt_path
+            info_dict['best_epoch'] = int(re.search(r'epoch_(\d+)', info_dict['ckpt_path']).group(1))
             # Add metrics info
             metrics_df = pd.DataFrame()
                 csv_metrics_path = output_dir / config.logger.csv.name / "metrics.csv"
                 if csv_metrics_path.is_file():
                     log.info(f"Summarizing metrics with prefix `{self.prefix}` from {csv_metrics_path}")
                     metrics_df = pd.read_csv(csv_metrics_path)
+                    # Find rows where 'test/' columns are not null and reset its epoch to the best model epoch
                     test_columns = [col for col in metrics_df.columns if col.startswith('test/')]
+                    if test_columns:
+                        mask = metrics_df[test_columns].notna().any(axis=1)
+                        metrics_df.loc[mask, 'epoch'] = info_dict['best_epoch']
                     # Group and filter by best epoch
                     metrics_df = metrics_df.groupby('epoch').first()
+                    metrics_df = metrics_df[metrics_df.index == info_dict['best_epoch']]
                 else:
                     log.info(f"No metrics.csv found in {output_dir}")
             if metrics_df.empty:
+                metrics_df = pd.DataFrame(data=info_dict, index=[0])
             else:
+                metrics_df = metrics_df.assign(**info_dict)
                 metrics_df.index = [0]
             # Add extra info from the input batch experiment summary
                 orig_meta = self.input_experiment_summary[
                     self.input_experiment_summary['ckpt_path'] == metrics_df['ckpt_path'][0]
                     ].head(1)
+                if not orig_meta.empty:
+                    orig_meta.index = [0]
                 metrics_df = metrics_df.combine_first(orig_meta)
             summary_df = pd.concat([summary_df, metrics_df])
             ckpt_cfg.data = OmegaConf.masked_copy(ckpt_cfg.data, [
                 key for key in ckpt_cfg.data.keys() if key not in ['data_file', 'split', 'train_val_test_split']
             ])
+            ckpt_override_keys = ['task', 'data.drug_featurizer', 'data.protein_featurizer', 'data.collator',
+                                  'model.predictor', 'model.out', 'model.loss', 'model.activation', 'model.metrics']
             for key in ckpt_override_keys:
                 OmegaConf.update(config, key, OmegaConf.select(ckpt_cfg, key), force_add=True)
             _save_config(config, "config.yaml", hydra_output)
     return config

resources/vocabs/drug_vqa/combinedVoc-wholeFour.voc CHANGED Viewed

@@ -1,4 +1,3 @@
-[PAD]
 [102Ru]
 [80Se]
 [N-]

 [102Ru]
 [80Se]
 [N-]