root
commited on
Commit
·
c43fbc6
1
Parent(s):
3efa812
dependencies and embedding_exploration benchmark
Browse files- fuson_plm/README.md +588 -0
- fuson_plm/benchmarking/README.md +11 -0
- fuson_plm/benchmarking/__init__.py +0 -0
- fuson_plm/benchmarking/embed.py +296 -0
- fuson_plm/benchmarking/embedding_exploration/README.md +58 -0
- fuson_plm/benchmarking/embedding_exploration/__init__.py +0 -0
- fuson_plm/benchmarking/embedding_exploration/config.py +10 -0
- fuson_plm/benchmarking/embedding_exploration/data/salokas_2020_tableS3.csv +3 -0
- fuson_plm/benchmarking/embedding_exploration/data/tf_and_kinase_fusions.csv +3 -0
- fuson_plm/benchmarking/embedding_exploration/data/top_genes.csv +3 -0
- fuson_plm/benchmarking/embedding_exploration/plot.py +496 -0
- fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/favorites/umap_favorites_source_data.csv +3 -0
- fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/favorites/umap_favorites_visualization.png +0 -0
- fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/tf_and_kinase/umap_tf_and_kinase_fusions_source_data.csv +3 -0
- fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/tf_and_kinase/umap_tf_and_kinase_fusions_visualization.png +0 -0
- fuson_plm/benchmarking/mutation_prediction/README.md +1 -1
- fuson_plm/benchmarking/puncta/train.py +1 -1
- fuson_plm/benchmarking/xgboost_predictor.py +65 -0
fuson_plm/README.md
ADDED
|
@@ -0,0 +1,588 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Dependencies
|
| 2 |
+
|
| 3 |
+
Here we provide the package versions needed to run FusOn-pLM code. For the project, Docker containers were used. We provide a pip list of what is inside the Docker container, as well as the images used for our containers.
|
| 4 |
+
|
| 5 |
+
## pip installs
|
| 6 |
+
|
| 7 |
+
The following dependencies were used for all training and benchmarking except for the `puncta` benchmarks.
|
| 8 |
+
Note that after cloning the repository, you will need to run `pip install -e .` outside the `fuson_plm` directory to install the `fuson_plm` package.
|
| 9 |
+
|
| 10 |
+
Package Version Editable project location
|
| 11 |
+
------------------------- -------------------- -------------------------
|
| 12 |
+
absl-py 1.4.0
|
| 13 |
+
aiohttp 3.8.4
|
| 14 |
+
aiosignal 1.3.1
|
| 15 |
+
apex 0.1
|
| 16 |
+
argon2-cffi 21.3.0
|
| 17 |
+
argon2-cffi-bindings 21.2.0
|
| 18 |
+
asttokens 2.2.1
|
| 19 |
+
astunparse 1.6.3
|
| 20 |
+
async-timeout 4.0.2
|
| 21 |
+
attrs 23.1.0
|
| 22 |
+
audioread 3.0.0
|
| 23 |
+
backcall 0.2.0
|
| 24 |
+
beautifulsoup4 4.12.2
|
| 25 |
+
bio 1.7.1
|
| 26 |
+
biopython 1.84
|
| 27 |
+
biothings-client 0.3.1
|
| 28 |
+
bleach 6.0.0
|
| 29 |
+
blis 0.7.10
|
| 30 |
+
cachetools 5.3.1
|
| 31 |
+
catalogue 2.0.9
|
| 32 |
+
certifi 2023.7.22
|
| 33 |
+
cffi 1.15.1
|
| 34 |
+
charset-normalizer 3.2.0
|
| 35 |
+
click 8.1.5
|
| 36 |
+
cloudpickle 2.2.1
|
| 37 |
+
cmake 3.27.1
|
| 38 |
+
comm 0.1.4
|
| 39 |
+
confection 0.1.1
|
| 40 |
+
contourpy 1.1.0
|
| 41 |
+
cubinlinker 0.3.0+2.g7c3675e
|
| 42 |
+
cuda-python 12.1.0rc5+1.g994d8d0
|
| 43 |
+
cudf 23.6.0
|
| 44 |
+
cugraph 23.6.0
|
| 45 |
+
cugraph-dgl 23.6.0
|
| 46 |
+
cugraph-service-client 23.6.0
|
| 47 |
+
cugraph-service-server 23.6.0
|
| 48 |
+
cuml 23.6.0
|
| 49 |
+
cupy-cuda12x 12.1.0
|
| 50 |
+
cycler 0.11.0
|
| 51 |
+
cymem 2.0.7
|
| 52 |
+
Cython 3.0.0
|
| 53 |
+
dask 2023.3.2
|
| 54 |
+
dask-cuda 23.6.0
|
| 55 |
+
dask-cudf 23.6.0
|
| 56 |
+
debugpy 1.6.7
|
| 57 |
+
decorator 5.1.1
|
| 58 |
+
defusedxml 0.7.1
|
| 59 |
+
distributed 2023.3.2.1
|
| 60 |
+
dm-tree 0.1.8
|
| 61 |
+
docker-pycreds 0.4.0
|
| 62 |
+
einops 0.6.1
|
| 63 |
+
exceptiongroup 1.1.2
|
| 64 |
+
execnet 2.0.2
|
| 65 |
+
executing 1.2.0
|
| 66 |
+
expecttest 0.1.3
|
| 67 |
+
fair-esm 2.0.0
|
| 68 |
+
fastjsonschema 2.18.0
|
| 69 |
+
fastrlock 0.8.1
|
| 70 |
+
filelock 3.12.2
|
| 71 |
+
flash-attn 2.0.4
|
| 72 |
+
fonttools 4.42.0
|
| 73 |
+
frozenlist 1.4.0
|
| 74 |
+
fsspec 2023.6.0
|
| 75 |
+
fuson-plm 1.0 /workspace/FusOn-pLM
|
| 76 |
+
gast 0.5.4
|
| 77 |
+
gdown 5.2.0
|
| 78 |
+
gitdb 4.0.11
|
| 79 |
+
GitPython 3.1.43
|
| 80 |
+
google-auth 2.22.0
|
| 81 |
+
google-auth-oauthlib 0.4.6
|
| 82 |
+
gprofiler-official 1.0.0
|
| 83 |
+
graphsurgeon 0.4.6
|
| 84 |
+
grpcio 1.56.2
|
| 85 |
+
huggingface-hub 0.25.2
|
| 86 |
+
hypothesis 5.35.1
|
| 87 |
+
idna 3.4
|
| 88 |
+
importlib-metadata 6.8.0
|
| 89 |
+
iniconfig 2.0.0
|
| 90 |
+
intel-openmp 2021.4.0
|
| 91 |
+
ipykernel 6.25.0
|
| 92 |
+
ipython 8.14.0
|
| 93 |
+
ipython-genutils 0.2.0
|
| 94 |
+
jedi 0.19.0
|
| 95 |
+
Jinja2 3.1.2
|
| 96 |
+
joblib 1.3.1
|
| 97 |
+
json5 0.9.14
|
| 98 |
+
jsonschema 4.18.6
|
| 99 |
+
jsonschema-specifications 2023.7.1
|
| 100 |
+
jupyter_client 8.3.0
|
| 101 |
+
jupyter_core 5.3.1
|
| 102 |
+
jupyter-tensorboard 0.2.0
|
| 103 |
+
jupyterlab 2.3.2
|
| 104 |
+
jupyterlab-pygments 0.2.2
|
| 105 |
+
jupyterlab-server 1.2.0
|
| 106 |
+
jupytext 1.15.0
|
| 107 |
+
kiwisolver 1.4.4
|
| 108 |
+
langcodes 3.3.0
|
| 109 |
+
librosa 0.9.2
|
| 110 |
+
lightning-utilities 0.11.8
|
| 111 |
+
llvmlite 0.40.1
|
| 112 |
+
locket 1.0.0
|
| 113 |
+
Markdown 3.4.4
|
| 114 |
+
markdown-it-py 3.0.0
|
| 115 |
+
MarkupSafe 2.1.3
|
| 116 |
+
matplotlib 3.7.2
|
| 117 |
+
matplotlib-inline 0.1.6
|
| 118 |
+
mdit-py-plugins 0.4.0
|
| 119 |
+
mdurl 0.1.2
|
| 120 |
+
mistune 3.0.1
|
| 121 |
+
mkl 2021.1.1
|
| 122 |
+
mkl-devel 2021.1.1
|
| 123 |
+
mkl-include 2021.1.1
|
| 124 |
+
mock 5.1.0
|
| 125 |
+
mpmath 1.3.0
|
| 126 |
+
msgpack 1.0.5
|
| 127 |
+
multidict 6.0.4
|
| 128 |
+
murmurhash 1.0.9
|
| 129 |
+
mygene 3.2.2
|
| 130 |
+
nbclient 0.8.0
|
| 131 |
+
nbconvert 7.7.3
|
| 132 |
+
nbformat 5.9.2
|
| 133 |
+
nest-asyncio 1.5.7
|
| 134 |
+
networkx 2.6.3
|
| 135 |
+
ninja 1.11.1
|
| 136 |
+
notebook 6.4.10
|
| 137 |
+
numba 0.57.1+1.gc785c8f1f
|
| 138 |
+
numpy 1.22.2
|
| 139 |
+
nvidia-cublas-cu12 12.4.5.8
|
| 140 |
+
nvidia-cuda-cupti-cu12 12.4.127
|
| 141 |
+
nvidia-cuda-nvrtc-cu12 12.4.127
|
| 142 |
+
nvidia-cuda-runtime-cu12 12.4.127
|
| 143 |
+
nvidia-cudnn-cu12 9.1.0.70
|
| 144 |
+
nvidia-cufft-cu12 11.2.1.3
|
| 145 |
+
nvidia-curand-cu12 10.3.5.147
|
| 146 |
+
nvidia-cusolver-cu12 11.6.1.9
|
| 147 |
+
nvidia-cusparse-cu12 12.3.1.170
|
| 148 |
+
nvidia-dali-cuda120 1.28.0
|
| 149 |
+
nvidia-nccl-cu12 2.21.5
|
| 150 |
+
nvidia-nvjitlink-cu12 12.4.127
|
| 151 |
+
nvidia-nvtx-cu12 12.4.127
|
| 152 |
+
nvidia-pyindex 1.0.9
|
| 153 |
+
nvtx 0.2.5
|
| 154 |
+
oauthlib 3.2.2
|
| 155 |
+
onnx 1.14.0
|
| 156 |
+
opencv 4.7.0
|
| 157 |
+
packaging 23.1
|
| 158 |
+
pandas 1.5.2
|
| 159 |
+
pandocfilters 1.5.0
|
| 160 |
+
parso 0.8.3
|
| 161 |
+
partd 1.4.0
|
| 162 |
+
pathy 0.10.2
|
| 163 |
+
pexpect 4.8.0
|
| 164 |
+
pickleshare 0.7.5
|
| 165 |
+
Pillow 9.2.0
|
| 166 |
+
pip 23.2.1
|
| 167 |
+
platformdirs 3.10.0
|
| 168 |
+
pluggy 1.2.0
|
| 169 |
+
ply 3.11
|
| 170 |
+
polygraphy 0.47.1
|
| 171 |
+
pooch 1.7.0
|
| 172 |
+
preshed 3.0.8
|
| 173 |
+
prettytable 3.8.0
|
| 174 |
+
prometheus-client 0.17.1
|
| 175 |
+
prompt-toolkit 3.0.39
|
| 176 |
+
protobuf 4.21.12
|
| 177 |
+
psutil 5.9.4
|
| 178 |
+
ptxcompiler 0.8.1+1.g4a94326
|
| 179 |
+
ptyprocess 0.7.0
|
| 180 |
+
pure-eval 0.2.2
|
| 181 |
+
py3Dmol 2.4.0
|
| 182 |
+
pyarrow 11.0.0
|
| 183 |
+
pyasn1 0.5.0
|
| 184 |
+
pyasn1-modules 0.3.0
|
| 185 |
+
pybind11 2.11.1
|
| 186 |
+
pycocotools 2.0+nv0.7.3
|
| 187 |
+
pycparser 2.21
|
| 188 |
+
pydantic 1.10.12
|
| 189 |
+
Pygments 2.16.1
|
| 190 |
+
pylibcugraph 23.6.0
|
| 191 |
+
pylibcugraphops 23.6.0
|
| 192 |
+
pylibraft 23.6.0
|
| 193 |
+
pynndescent 0.5.13
|
| 194 |
+
pynvml 11.4.1
|
| 195 |
+
pyparsing 3.0.9
|
| 196 |
+
PySocks 1.7.1
|
| 197 |
+
pytest 7.4.0
|
| 198 |
+
pytest-flakefinder 1.1.0
|
| 199 |
+
pytest-rerunfailures 12.0
|
| 200 |
+
pytest-shard 0.1.2
|
| 201 |
+
pytest-xdist 3.3.1
|
| 202 |
+
python-dateutil 2.8.2
|
| 203 |
+
python-hostlist 1.23.0
|
| 204 |
+
pytorch-lightning 2.4.0
|
| 205 |
+
pytorch-quantization 2.1.2
|
| 206 |
+
pytz 2023.3
|
| 207 |
+
PyYAML 6.0.1
|
| 208 |
+
pyzmq 25.1.0
|
| 209 |
+
raft-dask 23.6.0
|
| 210 |
+
referencing 0.30.2
|
| 211 |
+
regex 2023.6.3
|
| 212 |
+
requests 2.31.0
|
| 213 |
+
requests-oauthlib 1.3.1
|
| 214 |
+
resampy 0.4.2
|
| 215 |
+
rmm 23.6.0
|
| 216 |
+
rpds-py 0.9.2
|
| 217 |
+
rsa 4.9
|
| 218 |
+
safetensors 0.4.5
|
| 219 |
+
scikit-learn 1.2.0
|
| 220 |
+
scipy 1.11.1
|
| 221 |
+
seaborn 0.13.2
|
| 222 |
+
Send2Trash 1.8.2
|
| 223 |
+
sentencepiece 0.2.0
|
| 224 |
+
sentry-sdk 2.16.0
|
| 225 |
+
setproctitle 1.3.3
|
| 226 |
+
setuptools 68.0.0
|
| 227 |
+
six 1.16.0
|
| 228 |
+
smart-open 6.3.0
|
| 229 |
+
smmap 5.0.1
|
| 230 |
+
sortedcontainers 2.4.0
|
| 231 |
+
soundfile 0.12.1
|
| 232 |
+
soupsieve 2.4.1
|
| 233 |
+
spacy 3.6.0
|
| 234 |
+
spacy-legacy 3.0.12
|
| 235 |
+
spacy-loggers 1.0.4
|
| 236 |
+
sphinx-glpi-theme 0.3
|
| 237 |
+
srsly 2.4.7
|
| 238 |
+
stack-data 0.6.2
|
| 239 |
+
sympy 1.13.1
|
| 240 |
+
tabulate 0.9.0
|
| 241 |
+
tbb 2021.10.0
|
| 242 |
+
tblib 2.0.0
|
| 243 |
+
tensorboard 2.9.0
|
| 244 |
+
tensorboard-data-server 0.6.1
|
| 245 |
+
tensorboard-plugin-wit 1.8.1
|
| 246 |
+
tensorrt 8.6.1
|
| 247 |
+
terminado 0.17.1
|
| 248 |
+
thinc 8.1.10
|
| 249 |
+
threadpoolctl 3.2.0
|
| 250 |
+
thriftpy2 0.4.16
|
| 251 |
+
tinycss2 1.2.1
|
| 252 |
+
tokenizers 0.20.1
|
| 253 |
+
toml 0.10.2
|
| 254 |
+
tomli 2.0.1
|
| 255 |
+
toolz 0.12.0
|
| 256 |
+
torch 2.5.0
|
| 257 |
+
torch-tensorrt 2.0.0.dev0
|
| 258 |
+
torchdata 0.7.0a0
|
| 259 |
+
torchmetrics 1.5.0
|
| 260 |
+
torchtext 0.16.0a0
|
| 261 |
+
torchvision 0.16.0a0
|
| 262 |
+
tornado 6.3.2
|
| 263 |
+
tqdm 4.65.0
|
| 264 |
+
traitlets 5.9.0
|
| 265 |
+
transformer-engine 0.11.0+3f01b4f
|
| 266 |
+
transformers 4.45.2
|
| 267 |
+
treelite 3.2.0
|
| 268 |
+
treelite-runtime 3.2.0
|
| 269 |
+
triton 3.1.0
|
| 270 |
+
typer 0.9.0
|
| 271 |
+
types-dataclasses 0.6.6
|
| 272 |
+
typing_extensions 4.12.2
|
| 273 |
+
ucx-py 0.32.0
|
| 274 |
+
uff 0.6.9
|
| 275 |
+
umap-learn 0.5.6
|
| 276 |
+
urllib3 1.26.16
|
| 277 |
+
wandb 0.18.3
|
| 278 |
+
wasabi 1.1.2
|
| 279 |
+
wcwidth 0.2.6
|
| 280 |
+
webencodings 0.5.1
|
| 281 |
+
Werkzeug 2.3.6
|
| 282 |
+
wheel 0.41.1
|
| 283 |
+
xdoctest 1.0.2
|
| 284 |
+
xgboost 1.7.5
|
| 285 |
+
yarl 1.9.2
|
| 286 |
+
zict 3.0.0
|
| 287 |
+
zipp 3.16.2
|
| 288 |
+
|
| 289 |
+
The following packages and versions were used for the `puncta` benchmarks. A different environment was required to run ProtT5.
|
| 290 |
+
|
| 291 |
+
Package Version Editable project location
|
| 292 |
+
------------------------- -------------------------- -------------------------
|
| 293 |
+
absl-py 2.1.0
|
| 294 |
+
aiohttp 3.9.3
|
| 295 |
+
aiosignal 1.3.1
|
| 296 |
+
annotated-types 0.6.0
|
| 297 |
+
anyio 4.8.0
|
| 298 |
+
apex 0.1
|
| 299 |
+
argon2-cffi 23.1.0
|
| 300 |
+
argon2-cffi-bindings 21.2.0
|
| 301 |
+
asttokens 2.4.1
|
| 302 |
+
astunparse 1.6.3
|
| 303 |
+
async-timeout 4.0.3
|
| 304 |
+
attrs 23.2.0
|
| 305 |
+
audioread 3.0.1
|
| 306 |
+
beautifulsoup4 4.12.3
|
| 307 |
+
bio 1.7.1
|
| 308 |
+
biopython 1.85
|
| 309 |
+
biothings_client 0.4.1
|
| 310 |
+
bleach 6.1.0
|
| 311 |
+
blis 0.7.11
|
| 312 |
+
cachetools 5.3.3
|
| 313 |
+
catalogue 2.0.10
|
| 314 |
+
certifi 2024.2.2
|
| 315 |
+
cffi 1.16.0
|
| 316 |
+
charset-normalizer 3.3.2
|
| 317 |
+
click 8.1.7
|
| 318 |
+
cloudpathlib 0.16.0
|
| 319 |
+
cloudpickle 3.0.0
|
| 320 |
+
cmake 3.29.0.1
|
| 321 |
+
comm 0.2.2
|
| 322 |
+
confection 0.1.4
|
| 323 |
+
contourpy 1.2.1
|
| 324 |
+
cuda-python 12.4.0rc7+3.ge75c8a9.dirty
|
| 325 |
+
cudf 24.2.0
|
| 326 |
+
cudnn 1.1.2
|
| 327 |
+
cugraph 24.2.0
|
| 328 |
+
cugraph-dgl 24.2.0
|
| 329 |
+
cugraph-service-client 24.2.0
|
| 330 |
+
cugraph-service-server 24.2.0
|
| 331 |
+
cuml 24.2.0
|
| 332 |
+
cupy-cuda12x 13.0.0
|
| 333 |
+
cycler 0.12.1
|
| 334 |
+
cymem 2.0.8
|
| 335 |
+
Cython 3.0.10
|
| 336 |
+
dask 2024.1.1
|
| 337 |
+
dask-cuda 24.2.0
|
| 338 |
+
dask-cudf 24.2.0
|
| 339 |
+
debugpy 1.8.1
|
| 340 |
+
decorator 5.1.1
|
| 341 |
+
defusedxml 0.7.1
|
| 342 |
+
distributed 2024.1.1
|
| 343 |
+
dm-tree 0.1.8
|
| 344 |
+
docker-pycreds 0.4.0
|
| 345 |
+
einops 0.7.0
|
| 346 |
+
exceptiongroup 1.2.0
|
| 347 |
+
execnet 2.0.2
|
| 348 |
+
executing 2.0.1
|
| 349 |
+
expecttest 0.1.3
|
| 350 |
+
fair-esm 2.0.0
|
| 351 |
+
fastjsonschema 2.19.1
|
| 352 |
+
fastrlock 0.8.2
|
| 353 |
+
filelock 3.13.3
|
| 354 |
+
flash-attn 2.4.2
|
| 355 |
+
fonttools 4.51.0
|
| 356 |
+
frozenlist 1.4.1
|
| 357 |
+
fsspec 2024.2.0
|
| 358 |
+
fuson-plm 1.0 /workspace/FusOn-pLM
|
| 359 |
+
gast 0.5.4
|
| 360 |
+
gdown 5.2.0
|
| 361 |
+
gitdb 4.0.12
|
| 362 |
+
GitPython 3.1.44
|
| 363 |
+
google-auth 2.29.0
|
| 364 |
+
google-auth-oauthlib 0.4.6
|
| 365 |
+
gprofiler-official 1.0.0
|
| 366 |
+
graphsurgeon 0.4.6
|
| 367 |
+
grpcio 1.62.1
|
| 368 |
+
h11 0.14.0
|
| 369 |
+
httpcore 1.0.7
|
| 370 |
+
httpx 0.28.1
|
| 371 |
+
huggingface-hub 0.27.1
|
| 372 |
+
hypothesis 5.35.1
|
| 373 |
+
idna 3.6
|
| 374 |
+
igraph 0.11.4
|
| 375 |
+
importlib_metadata 7.0.2
|
| 376 |
+
iniconfig 2.0.0
|
| 377 |
+
intel-openmp 2021.4.0
|
| 378 |
+
ipykernel 6.29.4
|
| 379 |
+
ipython 8.21.0
|
| 380 |
+
ipython-genutils 0.2.0
|
| 381 |
+
jedi 0.19.1
|
| 382 |
+
Jinja2 3.1.3
|
| 383 |
+
joblib 1.3.2
|
| 384 |
+
json5 0.9.24
|
| 385 |
+
jsonschema 4.21.1
|
| 386 |
+
jsonschema-specifications 2023.12.1
|
| 387 |
+
jupyter_client 8.6.1
|
| 388 |
+
jupyter_core 5.7.2
|
| 389 |
+
jupyter-tensorboard 0.2.0
|
| 390 |
+
jupyterlab 2.3.2
|
| 391 |
+
jupyterlab_pygments 0.3.0
|
| 392 |
+
jupyterlab-server 1.2.0
|
| 393 |
+
jupytext 1.16.1
|
| 394 |
+
kiwisolver 1.4.5
|
| 395 |
+
langcodes 3.3.0
|
| 396 |
+
lark 1.1.9
|
| 397 |
+
lazy_loader 0.4
|
| 398 |
+
librosa 0.10.1
|
| 399 |
+
lightning-thunder 0.1.0
|
| 400 |
+
lightning-utilities 0.11.2
|
| 401 |
+
llvmlite 0.42.0
|
| 402 |
+
locket 1.0.0
|
| 403 |
+
looseversion 1.3.0
|
| 404 |
+
Markdown 3.6
|
| 405 |
+
markdown-it-py 3.0.0
|
| 406 |
+
MarkupSafe 2.1.5
|
| 407 |
+
matplotlib 3.8.4
|
| 408 |
+
matplotlib-inline 0.1.6
|
| 409 |
+
mdit-py-plugins 0.4.0
|
| 410 |
+
mdurl 0.1.2
|
| 411 |
+
mistune 3.0.2
|
| 412 |
+
mkl 2021.1.1
|
| 413 |
+
mkl-devel 2021.1.1
|
| 414 |
+
mkl-include 2021.1.1
|
| 415 |
+
mock 5.1.0
|
| 416 |
+
mpmath 1.3.0
|
| 417 |
+
msgpack 1.0.8
|
| 418 |
+
multidict 6.0.5
|
| 419 |
+
murmurhash 1.0.10
|
| 420 |
+
mygene 3.2.2
|
| 421 |
+
nbclient 0.10.0
|
| 422 |
+
nbconvert 7.16.3
|
| 423 |
+
nbformat 5.10.4
|
| 424 |
+
nest-asyncio 1.6.0
|
| 425 |
+
networkx 2.6.3
|
| 426 |
+
ninja 1.11.1.1
|
| 427 |
+
notebook 6.4.10
|
| 428 |
+
numba 0.59.0+1.g20ae2b56c
|
| 429 |
+
numpy 1.24.4
|
| 430 |
+
nvfuser 0.1.6a0+a684e2a
|
| 431 |
+
nvidia-dali-cuda120 1.36.0
|
| 432 |
+
nvidia-nvimgcodec-cu12 0.2.0.7
|
| 433 |
+
nvidia-pyindex 1.0.9
|
| 434 |
+
nvtx 0.2.5
|
| 435 |
+
oauthlib 3.2.2
|
| 436 |
+
onnx 1.16.0
|
| 437 |
+
opencv 4.7.0
|
| 438 |
+
opt-einsum 3.3.0
|
| 439 |
+
optree 0.11.0
|
| 440 |
+
packaging 23.2
|
| 441 |
+
pandas 1.5.3
|
| 442 |
+
pandocfilters 1.5.1
|
| 443 |
+
parso 0.8.4
|
| 444 |
+
partd 1.4.1
|
| 445 |
+
pexpect 4.9.0
|
| 446 |
+
pillow 10.2.0
|
| 447 |
+
pip 24.0
|
| 448 |
+
platformdirs 4.2.0
|
| 449 |
+
pluggy 1.4.0
|
| 450 |
+
ply 3.11
|
| 451 |
+
polygraphy 0.49.8
|
| 452 |
+
pooch 1.8.1
|
| 453 |
+
preshed 3.0.9
|
| 454 |
+
prettytable 3.10.0
|
| 455 |
+
prometheus_client 0.20.0
|
| 456 |
+
prompt-toolkit 3.0.43
|
| 457 |
+
protobuf 4.24.4
|
| 458 |
+
psutil 5.9.4
|
| 459 |
+
ptyprocess 0.7.0
|
| 460 |
+
pure-eval 0.2.2
|
| 461 |
+
py3Dmol 2.4.2
|
| 462 |
+
pyarrow 14.0.1
|
| 463 |
+
pyasn1 0.6.0
|
| 464 |
+
pyasn1_modules 0.4.0
|
| 465 |
+
pybind11 2.12.0
|
| 466 |
+
pybind11_global 2.12.0
|
| 467 |
+
pycocotools 2.0+nv0.8.0
|
| 468 |
+
pycparser 2.22
|
| 469 |
+
pydantic 2.6.4
|
| 470 |
+
pydantic_core 2.16.3
|
| 471 |
+
Pygments 2.17.2
|
| 472 |
+
pylibcugraph 24.2.0
|
| 473 |
+
pylibcugraphops 24.2.0
|
| 474 |
+
pylibraft 24.2.0
|
| 475 |
+
pynndescent 0.5.13
|
| 476 |
+
pynvjitlink 0.1.13
|
| 477 |
+
pynvml 11.4.1
|
| 478 |
+
pyparsing 3.1.2
|
| 479 |
+
PySocks 1.7.1
|
| 480 |
+
pytest 8.1.1
|
| 481 |
+
pytest-flakefinder 1.1.0
|
| 482 |
+
pytest-rerunfailures 14.0
|
| 483 |
+
pytest-shard 0.1.2
|
| 484 |
+
pytest-xdist 3.5.0
|
| 485 |
+
python-dateutil 2.9.0.post0
|
| 486 |
+
python-hostlist 1.23.0
|
| 487 |
+
pytorch-lightning 2.5.0.post0
|
| 488 |
+
pytorch-quantization 2.1.2
|
| 489 |
+
pytorch-triton 3.0.0+a9bc1a364
|
| 490 |
+
pytz 2024.1
|
| 491 |
+
PyYAML 6.0.1
|
| 492 |
+
pyzmq 25.1.2
|
| 493 |
+
raft-dask 24.2.0
|
| 494 |
+
rapids-dask-dependency 24.2.0a0
|
| 495 |
+
referencing 0.34.0
|
| 496 |
+
regex 2023.12.25
|
| 497 |
+
requests 2.31.0
|
| 498 |
+
requests-oauthlib 2.0.0
|
| 499 |
+
rich 13.7.1
|
| 500 |
+
rmm 24.2.0
|
| 501 |
+
rpds-py 0.18.0
|
| 502 |
+
rsa 4.9
|
| 503 |
+
safetensors 0.5.2
|
| 504 |
+
scikit-learn 1.2.0
|
| 505 |
+
scipy 1.12.0
|
| 506 |
+
seaborn 0.13.2
|
| 507 |
+
Send2Trash 1.8.2
|
| 508 |
+
sentencepiece 0.2.0
|
| 509 |
+
sentry-sdk 2.20.0
|
| 510 |
+
setproctitle 1.3.4
|
| 511 |
+
setuptools 68.2.2
|
| 512 |
+
six 1.16.0
|
| 513 |
+
smart-open 6.4.0
|
| 514 |
+
smmap 5.0.2
|
| 515 |
+
sniffio 1.3.1
|
| 516 |
+
sortedcontainers 2.4.0
|
| 517 |
+
soundfile 0.12.1
|
| 518 |
+
soupsieve 2.5
|
| 519 |
+
soxr 0.3.7
|
| 520 |
+
spacy 3.7.4
|
| 521 |
+
spacy-legacy 3.0.12
|
| 522 |
+
spacy-loggers 1.0.5
|
| 523 |
+
sphinx_glpi_theme 0.6
|
| 524 |
+
srsly 2.4.8
|
| 525 |
+
stack-data 0.6.3
|
| 526 |
+
sympy 1.12
|
| 527 |
+
tabulate 0.9.0
|
| 528 |
+
tbb 2021.12.0
|
| 529 |
+
tblib 3.0.0
|
| 530 |
+
tensorboard 2.9.0
|
| 531 |
+
tensorboard-data-server 0.6.1
|
| 532 |
+
tensorboard-plugin-wit 1.8.1
|
| 533 |
+
tensorrt 8.6.3
|
| 534 |
+
terminado 0.18.1
|
| 535 |
+
texttable 1.7.0
|
| 536 |
+
thinc 8.2.3
|
| 537 |
+
threadpoolctl 3.3.0
|
| 538 |
+
thriftpy2 0.4.17
|
| 539 |
+
tinycss2 1.2.1
|
| 540 |
+
tokenizers 0.21.0
|
| 541 |
+
toml 0.10.2
|
| 542 |
+
tomli 2.0.1
|
| 543 |
+
toolz 0.12.1
|
| 544 |
+
torch 2.3.0a0+6ddf5cf85e.nv24.4
|
| 545 |
+
torch-tensorrt 2.3.0a0
|
| 546 |
+
torchdata 0.7.1a0
|
| 547 |
+
torchmetrics 1.6.1
|
| 548 |
+
torchtext 0.17.0a0
|
| 549 |
+
torchvision 0.18.0a0
|
| 550 |
+
tornado 6.4
|
| 551 |
+
tqdm 4.66.2
|
| 552 |
+
traitlets 5.9.0
|
| 553 |
+
transformer-engine 1.5.0+6a9edc3
|
| 554 |
+
transformers 4.48.0
|
| 555 |
+
treelite 4.0.0
|
| 556 |
+
typer 0.9.4
|
| 557 |
+
types-dataclasses 0.6.6
|
| 558 |
+
typing_extensions 4.10.0
|
| 559 |
+
ucx-py 0.36.0
|
| 560 |
+
uff 0.6.9
|
| 561 |
+
umap-learn 0.5.7
|
| 562 |
+
urllib3 1.26.18
|
| 563 |
+
wandb 0.19.4
|
| 564 |
+
wasabi 1.1.2
|
| 565 |
+
wcwidth 0.2.13
|
| 566 |
+
weasel 0.3.4
|
| 567 |
+
webencodings 0.5.1
|
| 568 |
+
Werkzeug 3.0.2
|
| 569 |
+
wheel 0.43.0
|
| 570 |
+
xdoctest 1.0.2
|
| 571 |
+
xgboost 1.7.5
|
| 572 |
+
yarl 1.9.4
|
| 573 |
+
zict 3.0.0
|
| 574 |
+
zipp 3.17.0
|
| 575 |
+
|
| 576 |
+
## Docker
|
| 577 |
+
|
| 578 |
+
The following image was used for Container 1 (all code except puncta benchmark):
|
| 579 |
+
|
| 580 |
+
```
|
| 581 |
+
nvcr.io/nvidia/pytorch:23.08-py3
|
| 582 |
+
```
|
| 583 |
+
|
| 584 |
+
The following image was used for Container 2 (puncta benchmark):
|
| 585 |
+
|
| 586 |
+
```
|
| 587 |
+
nvcr.io/nvidia/pytorch:24.04-py3
|
| 588 |
+
```
|
fuson_plm/benchmarking/README.md
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Benchmarking
|
| 2 |
+
|
| 3 |
+
This outer directory for the FusOn-pLM benchmarks contains utility functions stored in `.py` files, which are shared across the individual benchmarks.
|
| 4 |
+
|
| 5 |
+
### embed.py
|
| 6 |
+
|
| 7 |
+
This file contains functions used to make and organize FusOn-pLM and ESM embeddings of benchmarking datasets. Its functions are used in all benchmarks.
|
| 8 |
+
|
| 9 |
+
### xgboost_predictor.py
|
| 10 |
+
|
| 11 |
+
This file contains functions used to train XGBoost predictors, which are utilized in the `puncta` benchmark.
|
fuson_plm/benchmarking/__init__.py
ADDED
|
File without changes
|
fuson_plm/benchmarking/embed.py
ADDED
|
@@ -0,0 +1,296 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Python file for making embeddings from a FusOn-pLM model for any dataset
|
| 2 |
+
from fuson_plm.utils.embedding import get_esm_embeddings, load_esm2_type, redump_pickle_dictionary, load_prott5, get_prott5_embeddings
|
| 3 |
+
from fuson_plm.utils.logging import log_update, open_logfile, print_configpy
|
| 4 |
+
from fuson_plm.utils.data_cleaning import find_invalid_chars
|
| 5 |
+
from fuson_plm.utils.constants import VALID_AAS
|
| 6 |
+
from fuson_plm.training.model import FusOnpLM
|
| 7 |
+
from transformers import AutoModelForMaskedLM, AutoTokenizer, AutoModel
|
| 8 |
+
import logging
|
| 9 |
+
import torch
|
| 10 |
+
import pickle
|
| 11 |
+
import os
|
| 12 |
+
import pandas as pd
|
| 13 |
+
import numpy as np
|
| 14 |
+
|
| 15 |
+
def validate_sequence_col(df, seq_col):
    """Validate the amino-acid sequence column of a benchmarking dataframe.

    Checks that ``seq_col`` exists in ``df`` and that every sequence contains
    only valid amino-acid characters; logs a warning if duplicate sequences
    are present.

    Args:
        df (pd.DataFrame): input dataframe of sequences.
        seq_col (str): name of the column holding amino-acid sequences.

    Raises:
        Exception: if the column is missing, or if any sequence contains
            characters outside VALID_AAS.
    """
    # if column isn't there, error
    if seq_col not in df.columns:
        raise Exception("Error: provided sequence column does not exist in the input dataframe")

    # if column contains invalid characters, error.
    # NOTE: work on a standalone Series so the caller's dataframe is never
    # mutated (the previous version temporarily wrote an 'invalid_chars'
    # column into df; `df = df.drop(...)` only rebound the local name, so
    # the extra column leaked to the caller).
    invalid_per_seq = df[seq_col].apply(lambda x: find_invalid_chars(x, VALID_AAS))
    all_invalid_chars = set().union(*invalid_per_seq)
    if len(all_invalid_chars) > 0:
        raise Exception(f"Error: invalid characters {all_invalid_chars} found in the sequence column")

    # make sure there are no duplicates
    sequences = df[seq_col]
    if len(set(sequences)) < len(sequences):
        log_update("\tWARNING: input data has duplicate sequences")
def load_fuson_model(ckpt_path):
    """Load a FusOn-pLM checkpoint for embedding extraction.

    Args:
        ckpt_path (str): path to a HuggingFace-format checkpoint directory.

    Returns:
        tuple: (model, tokenizer, device) where the model is on ``device``
            and in eval mode.
    """
    # Suppress warnings about newly initialized 'esm.pooler.dense.bias',
    # 'esm.pooler.dense.weight' layers - these are not used to extract embeddings
    logging.getLogger("transformers.modeling_utils").setLevel(logging.ERROR)

    # Set device (GPU if available)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Use log_update for consistency with the rest of the file's logging
    log_update(f"Using device: {device}")

    # Load model and tokenizer from the checkpoint
    model = AutoModel.from_pretrained(ckpt_path)        # initialize model
    tokenizer = AutoTokenizer.from_pretrained(ckpt_path)  # initialize tokenizer

    # Model to device and in eval mode
    model.to(device)
    model.eval()  # disables dropout for deterministic results

    return model, tokenizer, device
def get_fuson_embeddings(model, tokenizer, sequences, device, average=True, print_updates=False, savepath=None, save_at_end=False, max_length=2000):
    """Generate FusOn-pLM embeddings for a list of sequences.

    Args:
        model: FusOn-pLM model (e.g. from ``load_fuson_model``).
        tokenizer: tokenizer matching the model.
        sequences (list[str]): amino-acid sequences to embed.
        device (torch.device): device the model lives on.
        average (bool): if True, mean-pool over residues to produce one
            vector per sequence; otherwise keep per-residue embeddings.
        print_updates (bool): if True, log progress for each sequence.
        savepath (str | None): optional path for a pickle mapping
            sequence -> embedding; '.pkl' is appended if missing.
        save_at_end (bool): if True, dump the full dictionary once at the
            end; otherwise append each embedding as it is computed (so
            partial progress survives a crash) and consolidate at the end.
        max_length (int | None): tokenizer truncation length; if None, it
            is set to the longest input sequence plus 2 (BOS + EOS).
    """
    # Correct save path to pickle if necessary
    if savepath is not None:
        if not savepath.endswith('.pkl'):
            savepath += '.pkl'

    if print_updates: log_update(f"Dataset contains {len(sequences)} sequences.")

    # If no max length was passed, set it to the longest sequence in the
    # dataset (+2 for BOS, EOS). Computed lazily so an explicit max_length
    # works even for an empty sequence list.
    if max_length is None:
        max_length = max(len(s) for s in sequences) + 2

    # Initialize an empty dict to store the FusOn-pLM embeddings
    embedding_dict = {}
    # Iterate through the seqs
    for i, sequence in enumerate(sequences):
        # Get the embeddings
        with torch.no_grad():
            # Tokenize the input sequence
            inputs = tokenizer(sequence, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
            inputs = {k: v.to(device) for k, v in inputs.items()}

            outputs = model(**inputs)
            # The embeddings are in the last_hidden_state tensor
            embedding = outputs.last_hidden_state
            # remove extra batch dimension
            embedding = embedding.squeeze(0)
            # remove BOS and EOS tokens
            embedding = embedding[1:-1, :]

            # Convert embeddings to numpy array
            embedding = embedding.cpu().numpy()

            # Average over the residue dimension (if requested)
            if average:
                embedding = embedding.mean(0)

            # Add to dictionary
            embedding_dict[sequence] = embedding

            # Save each embedding incrementally (crash-resilient mode);
            # the appended pickle stream is consolidated below.
            if savepath is not None and not save_at_end:
                with open(savepath, 'ab+') as f:
                    pickle.dump({sequence: embedding}, f)

            # Print update (if necessary)
            if print_updates: log_update(f"sequence {i+1}: {sequence[0:10]}...")

    # Dump all at once at the end (if necessary)
    if savepath is not None:
        if save_at_end:
            # Saving for the first (and only) time: write the full dictionary
            with open(savepath, 'wb') as f:
                pickle.dump(embedding_dict, f)
        else:
            # We've been appending all along and made it here without
            # crashing: rewrite the file as a single dictionary so it can
            # be loaded with one pickle.load
            redump_pickle_dictionary(savepath)
def embed_dataset(path_to_file, path_to_output, seq_col='aa_seq', model_type='fuson_plm', fuson_ckpt_path=None, average=True, overwrite=True, print_updates=False, max_length=2000):
    """
    Embed every unique sequence in a CSV with the requested model and pickle the results.

    Args:
        path_to_file (str): path to the input CSV containing sequences.
        path_to_output (str): path where the embeddings pickle is written.
        seq_col (str): name of the sequence column in the CSV.
        model_type (str): one of 'fuson_plm', 'esm2_t33_650M_UR50D',
            'prot_t5_xl_half_uniref50_enc'.
        fuson_ckpt_path (str | None): checkpoint path; required when model_type='fuson_plm'.
        average (bool): if True, store per-sequence mean embeddings instead of per-residue 2D embeddings.
        overwrite (bool): if True, warn and overwrite any existing file at path_to_output;
            if False and the file exists, log a warning and return None without embedding.
        print_updates (bool): if True, log one line per embedded sequence.
        max_length (int): tokenizer truncation length.

    Returns:
        None. Embeddings are written incrementally to path_to_output by the
        model-specific get_*_embeddings helpers.
    """
    # Make sure we aren't silently overwriting pre-existing embeddings
    if os.path.exists(path_to_output):
        if overwrite:
            log_update(f"WARNING: these embeddings may already exist at {path_to_output} and will be overwritten")
        else:
            log_update(f"WARNING: these embeddings may already exist at {path_to_output}. Skipping.")
            return None

    dataset = pd.read_csv(path_to_file)
    # Make sure the sequence column is valid
    validate_sequence_col(dataset, seq_col)

    sequences = dataset[seq_col].unique().tolist()  # ensure all entries are unique

    # The model branches are mutually exclusive, so use elif.
    # Exceptions are chained (`from e`) so the underlying traceback is preserved
    # instead of being swallowed by a bare `except`.
    if model_type == 'fuson_plm':
        if not os.path.exists(fuson_ckpt_path):
            raise Exception("FusOn-pLM ckpt path does not exist")

        # Load model
        try:
            model, tokenizer, device = load_fuson_model(fuson_ckpt_path)
        except Exception as e:
            raise Exception(f"Could not load FusOn-pLM from {fuson_ckpt_path}") from e

        # Generate embeddings
        try:
            get_fuson_embeddings(model, tokenizer, sequences, device, average=average,
                                 print_updates=print_updates, savepath=path_to_output, save_at_end=False,
                                 max_length=max_length)
        except Exception as e:
            raise Exception("Could not generate FusOn-pLM embeddings") from e

    elif model_type == 'esm2_t33_650M_UR50D':
        # Load model
        try:
            model, tokenizer, device = load_esm2_type(model_type)
        except Exception as e:
            raise Exception(f"Could not load {model_type}") from e
        # Generate embeddings
        try:
            get_esm_embeddings(model, tokenizer, sequences, device, average=average,
                               print_updates=print_updates, savepath=path_to_output, save_at_end=False,
                               max_length=max_length)
        except Exception as e:
            raise Exception(f"Could not generate {model_type} embeddings") from e

    elif model_type == "prot_t5_xl_half_uniref50_enc":
        # Load model
        try:
            model, tokenizer, device = load_prott5()
        except Exception as e:
            raise Exception(f"Could not load {model_type}") from e
        # Generate embeddings
        try:
            get_prott5_embeddings(model, tokenizer, sequences, device, average=average,
                                  print_updates=print_updates, savepath=path_to_output, save_at_end=False,
                                  max_length=max_length)
        except Exception as e:
            raise Exception(f"Could not generate {model_type} embeddings") from e
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def embed_dataset_for_benchmark(fuson_ckpts=None, input_data_path=None, input_fname=None, average=True, seq_col='seq', benchmark_fusonplm=False, benchmark_esm=False, benchmark_fo_puncta_ml=False, benchmark_prott5=False, overwrite=False, max_length=None):
    """
    Create embedding files for every model being benchmarked.

    Args:
        fuson_ckpts: dict mapping run name -> list of epochs (self-trained models),
            or the string "FusOn-pLM" to use the published best checkpoint.
        input_data_path (str): CSV with the sequences to embed.
        input_fname (str): short name used in the output pickle file names.
        average (bool): mean-pooled embeddings if True, per-residue 2D if False.
        seq_col (str): sequence column in the input CSV.
        benchmark_fusonplm / benchmark_esm / benchmark_fo_puncta_ml / benchmark_prott5 (bool):
            which models to embed with.
        overwrite (bool): passed through to embed_dataset.
        max_length (int | None): tokenizer truncation length.

    Returns:
        dict: embedding pickle path -> {'model_type', 'model', 'epoch'}.
    """
    # make directory for embeddings inside benchmarking dataset if one doesn't already exist
    os.makedirs('embeddings', exist_ok=True)

    emb_type_tag = 'average' if average else '2D'

    all_embedding_paths = dict()  # dictionary organized where embedding path points to model, epoch

    def _embed_fuson(model_name, epoch, fuson_ckpt_path):
        # Register + embed one FusOn-pLM checkpoint (shared by both fuson_ckpts forms).
        if not os.path.exists(fuson_ckpt_path):
            raise Exception(f"Error. Cannot find ckpt path: {fuson_ckpt_path}")

        # epoch-specific subdir for self-trained runs; plain model dir for the best ckpt
        if epoch is None:
            embedding_output_dir = f'embeddings/fuson_plm/{model_name}'
        else:
            embedding_output_dir = f'embeddings/fuson_plm/{model_name}/epoch{epoch}'
        embedding_output_path = f'{embedding_output_dir}/{input_fname}_{emb_type_tag}_embeddings.pkl'
        os.makedirs(embedding_output_dir, exist_ok=True)

        all_embedding_paths[embedding_output_path] = {
            'model_type': 'fuson_plm',
            'model': model_name,
            'epoch': epoch
        }

        # Create embeddings (or skip if they're already made)
        log_update(f"\tUsing ckpt {fuson_ckpt_path} and saving results to {embedding_output_path}...")
        embed_dataset(input_data_path, embedding_output_path,
                      seq_col=seq_col, model_type='fuson_plm',
                      fuson_ckpt_path=fuson_ckpt_path, average=average,
                      overwrite=overwrite, print_updates=True,
                      max_length=max_length)

    def _embed_baseline(model_type, pretty_name):
        # Register + embed one non-FusOn baseline model (shared by ESM-2 and ProtT5).
        os.makedirs(f'embeddings/{model_type}', exist_ok=True)
        embedding_output_path = f'embeddings/{model_type}/{input_fname}_{emb_type_tag}_embeddings.pkl'

        all_embedding_paths[embedding_output_path] = {
            'model_type': model_type,
            'model': model_type,
            'epoch': np.nan
        }

        log_update(f"\nMaking {pretty_name} embeddings for {input_data_path} and saving results to {embedding_output_path}...")
        embed_dataset(input_data_path, embedding_output_path,
                      seq_col=seq_col, model_type=model_type,
                      fuson_ckpt_path=None, average=average,
                      overwrite=overwrite, print_updates=True,
                      max_length=max_length)

    if benchmark_fusonplm:
        os.makedirs('embeddings/fuson_plm', exist_ok=True)
        log_update(f"\nMaking Fuson-PLM embeddings")
        if type(fuson_ckpts) == dict:
            # Self-trained models: one subdir per run, one pickle per epoch
            for model_name, epoch_list in fuson_ckpts.items():
                os.makedirs(f'embeddings/fuson_plm/{model_name}', exist_ok=True)
                for epoch in epoch_list:
                    _embed_fuson(model_name, epoch,
                                 f'../../training/checkpoints/{model_name}/checkpoint_epoch_{epoch}')
        elif fuson_ckpts == "FusOn-pLM":
            # Published best checkpoint lives at the FusOn-pLM repo root
            model_name = "best"
            os.makedirs(f'embeddings/fuson_plm/{model_name}', exist_ok=True)
            _embed_fuson(model_name, None, "../../..")
        else:
            raise Exception(f"Error. fuson_ckpts should be a dict or str")

    if benchmark_esm:
        _embed_baseline('esm2_t33_650M_UR50D', 'ESM-2-650M')

    if benchmark_prott5:
        _embed_baseline('prot_t5_xl_half_uniref50_enc', 'ProtT5-XL-UniRef50')

    if benchmark_fo_puncta_ml:
        # Physicochemical features are precomputed; just register the path
        all_embedding_paths['FOdb_physicochemical_embeddings.pkl'] = {
            'model_type': 'fo_puncta_ml',
            'model': 'fo_puncta_ml',
            'epoch': np.nan
        }

    return all_embedding_paths
|
fuson_plm/benchmarking/embedding_exploration/README.md
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Embedding exploration
|
| 2 |
+
|
| 3 |
+
This folder contains all the data and code needed to run embedding exploration (Fig. S3).
|
| 4 |
+
|
| 5 |
+
### Data download
|
| 6 |
+
To help select TF (transcription factor) and Kinase-containing fusions for investigation (Fig. S3a), Supplementary Table 3 from [Salokas et al. 2020](https://doi.org/10.1038/s41598-020-71040-8) was downloaded as a reference of transcription factors and kinases.
|
| 7 |
+
|
| 8 |
+
```
|
| 9 |
+
benchmarking/
|
| 10 |
+
└── embedding_exploration/
|
| 11 |
+
└── data/
|
| 12 |
+
├── salokas_2020_tableS3.csv
|
| 13 |
+
├── tf_and_kinase_fusions.csv
|
| 14 |
+
├── top_genes.csv
|
| 15 |
+
```
|
| 16 |
+
|
| 17 |
+
- **`data/salokas_2020_tableS3.csv`**: Supplementary Table 3 from [Salokas et al. 2020](https://doi.org/10.1038/s41598-020-71040-8)
|
| 18 |
+
- **`data/tf_and_kinase_fusions.csv`**: set of TF::TF and Kinase::Kinase fusion oncoproteins from FusOn-DB database. Curated in `plot.py`
|
| 19 |
+
- **`data/top_genes.csv`**: fusion oncoproteins (and their head and tail components) visualized in Fig. S3b. Sequences for head and tail components were pulled from the best-aligned sequences in `fuson_plm/data/blast/blast_outputs/best_htg_alignments_swissprot_seqs.pkl`
|
| 20 |
+
|
| 21 |
+
### Plotting
|
| 22 |
+
|
| 23 |
+
Run `plot.py` to regenerate plots in Figure S3:
|
| 24 |
+
|
| 25 |
+
```
|
| 26 |
+
# Dictionary: key = run name, values = epochs. (use this option if you've trained your own model)
|
| 27 |
+
# Or "FusOn-pLM" to use the official model
|
| 28 |
+
FUSON_PLM_CKPT= "FusOn-pLM"
|
| 29 |
+
|
| 30 |
+
# Type of dim reduction
|
| 31 |
+
PLOT_UMAP = True
|
| 32 |
+
PLOT_TSNE = False
|
| 33 |
+
|
| 34 |
+
# Overwriting configs
|
| 35 |
+
PERMISSION_TO_OVERWRITE = False # if False, script will halt if it believes these embeddings have already been made.
|
| 36 |
+
```
|
| 37 |
+
|
| 38 |
+
To run, use:
|
| 39 |
+
```
|
| 40 |
+
nohup python plot.py > plot.out 2> plot.err &
|
| 41 |
+
```
|
| 42 |
+
- All **results** are stored in `embedding_exploration/results/<timestamp>`, where `timestamp` is a unique string encoding the date and time when you started the run.
|
| 43 |
+
|
| 44 |
+
Below are the FusOn-pLM paper results in `results/final/umap_plots/fuson_plm/best/`:
|
| 45 |
+
|
| 46 |
+
```
|
| 47 |
+
benchmarking/
|
| 48 |
+
└── embedding_exploration/
|
| 49 |
+
└── results/final/umap_plots/fuson_plm/best/
|
| 50 |
+
└── favorites/
|
| 51 |
+
├── umap_favorites_source_data.csv
|
| 52 |
+
├── umap_favorites_visualization.png
|
| 53 |
+
└── tf_and_kinase/
|
| 54 |
+
├── umap_tf_and_kinase_fusions_source_data.csv
        ├── umap_tf_and_kinase_fusions_visualization.png
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
- **`favorites/umap_favorites_visualization.png`**: Fig. S3b, with the data directly plotted stored in `favorites/umap_favorites_source_data.csv`
|
| 58 |
+
- **`tf_and_kinase/umap_tf_and_kinase_fusions_visualization.png`**: Fig. S3a, with the data directly plotted stored in `tf_and_kinase/umap_tf_and_kinase_fusions_source_data.csv`.
|
fuson_plm/benchmarking/embedding_exploration/__init__.py
ADDED
|
File without changes
|
fuson_plm/benchmarking/embedding_exploration/config.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FusOn-pLM checkpoint selection.
# Either a dict mapping run name -> list of epochs (use this if you trained your own model),
# or the string "FusOn-pLM" to use the "best" checkpoint from the FusOn-pLM paper.
FUSON_PLM_CKPT = "FusOn-pLM"

# Which dimensionality reductions to plot
PLOT_UMAP = True
PLOT_TSNE = False

# Overwriting configs
PERMISSION_TO_OVERWRITE = False # if False, script will halt if it believes these embeddings have already been made.
|
fuson_plm/benchmarking/embedding_exploration/data/salokas_2020_tableS3.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d8bebc0871a4329015a3c6c7843f5bbc86c48811b2a836c42f1ef46b37f4282a
|
| 3 |
+
size 19626
|
fuson_plm/benchmarking/embedding_exploration/data/tf_and_kinase_fusions.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:372321137ed12b2f8aa7c4891dafd0e88d64d5c5d0ea9c6f3a0aa9d897e8ead6
|
| 3 |
+
size 557262
|
fuson_plm/benchmarking/embedding_exploration/data/top_genes.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:33d568fe413107318caebd5ee260ee66fe8571461ed8f8d1b47888441f7b5034
|
| 3 |
+
size 16695
|
fuson_plm/benchmarking/embedding_exploration/plot.py
ADDED
|
@@ -0,0 +1,496 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import pandas as pd
|
| 2 |
+
import numpy as np
|
| 3 |
+
import pickle
|
| 4 |
+
from sklearn.manifold import TSNE
|
| 5 |
+
import matplotlib.font_manager as fm
|
| 6 |
+
from matplotlib.font_manager import FontProperties
|
| 7 |
+
import matplotlib.pyplot as plt
|
| 8 |
+
import matplotlib.gridspec as gridspec
|
| 9 |
+
import matplotlib.patches as patches
|
| 10 |
+
import seaborn as sns
|
| 11 |
+
import umap
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
from fuson_plm.benchmarking.embed import embed_dataset_for_benchmark
|
| 15 |
+
import fuson_plm.benchmarking.embedding_exploration.config as config
|
| 16 |
+
from fuson_plm.utils.visualizing import set_font
|
| 17 |
+
from fuson_plm.utils.constants import TCGA_CODES, FODB_CODES, VALID_AAS, DELIMITERS
|
| 18 |
+
from fuson_plm.utils.logging import get_local_time, open_logfile, log_update, print_configpy
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
def get_dimred_embeddings(embeddings, dimred_type="umap"):
    """Reduce embeddings to 2D with the requested method.

    Args:
        embeddings: list/array of embedding vectors.
        dimred_type (str): "umap" or "tsne".

    Returns:
        np.ndarray of shape (n, 2).

    Raises:
        ValueError: if dimred_type is not "umap" or "tsne" (the original
        implementation silently returned None in that case).
    """
    if dimred_type == "umap":
        return get_umap_embeddings(embeddings)
    if dimred_type == "tsne":
        return get_tsne_embeddings(embeddings)
    raise ValueError(f"Unknown dimred_type: {dimred_type!r}; expected 'umap' or 'tsne'")
|
| 28 |
+
|
| 29 |
+
def get_tsne_embeddings(embeddings):
    """Project a collection of embedding vectors down to 2D with t-SNE.

    Uses a fixed random seed (42) and perplexity of 5 so runs are reproducible.
    """
    matrix = np.array(embeddings)
    reducer = TSNE(n_components=2, random_state=42, perplexity=5)
    return reducer.fit_transform(matrix)
|
| 34 |
+
|
| 35 |
+
def get_umap_embeddings(embeddings):
    """Project a collection of embedding vectors down to 2D with UMAP.

    Uses UMAP's default settings (n_neighbors=15, euclidean metric) with a
    fixed random seed (42) so runs are reproducible.
    """
    matrix = np.array(embeddings)
    reducer = umap.UMAP(n_components=2, random_state=42, n_neighbors=15, metric='euclidean')  # default parameters for UMAP
    return reducer.fit_transform(matrix)
|
| 40 |
+
|
| 41 |
+
def plot_half_filled_circle(ax, x, y, left_color, right_color, size=100):
    """
    Plots a circle filled in halves with specified colors.

    Parameters:
    - ax: Matplotlib axis to draw on.
    - x, y: Coordinates of the marker (data coordinates).
    - left_color: Color of the left half.
    - right_color: Color of the right half.
    - size: Size of the marker; radius is sqrt(size)/100 in data units.
    """
    radius = (size ** 0.5) / 100  # Scale the radius
    # Left half-circle: wedge sweeping 90° to 270° (counterclockwise)
    left_half = patches.Wedge((x, y), radius, 90, 270, color=left_color, ec="black")
    # Right half-circle: wedge sweeping 270° around to 90°
    right_half = patches.Wedge((x, y), radius, 270, 90, color=right_color, ec="black")

    # Add both halves to the plot
    ax.add_patch(left_half)
    ax.add_patch(right_half)
|
| 61 |
+
|
| 62 |
+
def plot_umap_scatter_tftf_kk(df, filename="umap.png"):
    """
    Plots a 2D scatterplot of UMAP coordinates, colored by fusion type.
    Only for TF::TF and Kinase::Kinase fusions.

    Parameters:
    - df (pd.DataFrame): DataFrame containing 'umap1', 'umap2', 'sequence', and 'fusion_type' columns.
      NOTE(review): column names are hard-coded to 'umap1'/'umap2'; callers using another
      dimensionality reduction would need matching column names — confirm upstream.
    - filename (str): path where the figure is saved (300 dpi).
    """
    set_font()

    # Define colors for each type
    colors = {
        "TF": "pink",
        "Kinase": "orange"
    }

    # Fill color for each fusion-type combination
    marker_colors = {
        "TF::TF": colors["TF"],
        "Kinase::Kinase": colors["Kinase"],
    }

    # Create the plot, padding the axis limits by 1 unit on each side
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.set_xlim(df["umap1"].min() - 1, df["umap1"].max() + 1)
    ax.set_ylim(df["umap2"].min() - 1, df["umap2"].max() + 1)

    # Plot each fusion as a small outlined dot colored by category
    # (iterrows avoids the positional iloc lookup per index)
    for _, row in df.iterrows():
        ax.scatter(row["umap1"], row["umap2"], color=marker_colors[row["fusion_type"]],
                   s=15, edgecolors="black", linewidth=0.5)

    # Add custom legend
    legend_elements = [
        patches.Patch(facecolor="pink", edgecolor="black", label="TF::TF"),
        patches.Patch(facecolor="orange", edgecolor="black", label="Kinase::Kinase")
    ]
    ax.legend(handles=legend_elements, title="Fusion Type", fontsize=16, title_fontsize=16)

    # Add labels and title
    plt.xlabel("UMAP 1", fontsize=20)
    plt.ylabel("UMAP 2", fontsize=20)
    plt.title("FusOn-pLM-embedded Transcription Factor and Kinase Fusions", fontsize=20)
    plt.tight_layout()

    # Save and show the plot
    plt.savefig(filename, dpi=300)
    plt.show()
|
| 116 |
+
|
| 117 |
+
def plot_umap_scatter_half_filled(df, filename="umap.png"):
    """
    Plots a 2D scatterplot of UMAP coordinates using half-filled circle markers:
    the left half reflects the head gene's type and the right half the tail
    gene's type (TF / Kinase / Other).

    Parameters:
    - df (pd.DataFrame): DataFrame containing 'umap1', 'umap2', 'sequence', and 'fusion_type' columns.
    - filename (str): path where the figure is saved (300 dpi).
    """
    # Define colors for each type
    colors = {
        "TF": "pink",
        "Kinase": "orange",
        "Other": "grey"
    }

    # Left/right half colors for every head::tail type combination
    marker_colors = {
        "TF::TF": {"left": colors["TF"], "right": colors["TF"]},
        "TF::Other": {"left": colors["TF"], "right": colors["Other"]},
        "Other::TF": {"left": colors["Other"], "right": colors["TF"]},
        "Kinase::Kinase": {"left": colors["Kinase"], "right": colors["Kinase"]},
        "Kinase::Other": {"left": colors["Kinase"], "right": colors["Other"]},
        "Other::Kinase": {"left": colors["Other"], "right": colors["Kinase"]},
        "Kinase::TF": {"left": colors["Kinase"], "right": colors["TF"]},
        "TF::Kinase": {"left": colors["TF"], "right": colors["Kinase"]},
        "Other::Other": {"left": colors["Other"], "right": colors["Other"]}
    }

    # Create the plot, padding the axis limits by 1 unit on each side
    fig, ax = plt.subplots(figsize=(10, 8))
    ax.set_xlim(df["umap1"].min() - 1, df["umap1"].max() + 1)
    ax.set_ylim(df["umap2"].min() - 1, df["umap2"].max() + 1)

    # Plot each point with the specified half-filled marker
    # (iterrows avoids the positional iloc lookup per index)
    for _, row in df.iterrows():
        halves = marker_colors[row["fusion_type"]]
        plot_half_filled_circle(ax, row["umap1"], row["umap2"],
                                halves["left"], halves["right"], size=100)

    # Add custom legend
    legend_elements = [
        patches.Patch(facecolor="pink", edgecolor="black", label="TF"),
        patches.Patch(facecolor="orange", edgecolor="black", label="Kinase"),
        patches.Patch(facecolor="grey", edgecolor="black", label="Other")
    ]
    ax.legend(handles=legend_elements, title="Type")

    # Add labels and title
    plt.xlabel("UMAP 1")
    plt.ylabel("UMAP 2")
    plt.title("UMAP Scatter Plot")
    plt.tight_layout()

    # Save and show the plot
    plt.savefig(filename, dpi=300)
    plt.show()
|
| 177 |
+
|
| 178 |
+
def get_gene_type(gene, d):
    """Map a gene symbol to 'Kinase', 'TF', or 'Other' using lookup dict d.

    Args:
        gene (str): gene symbol.
        d (dict): gene -> 'kinase' | 'tf' annotation (Salokas et al. 2020 table).

    Returns:
        str: 'Kinase', 'TF', or 'Other'.

    Bug fix: the original returned None (implicitly) when the gene was present
    in d but annotated with a label other than 'kinase'/'tf'; such genes now
    fall through to 'Other'.
    """
    label = d.get(gene)
    if label == 'kinase':
        return 'Kinase'
    if label == 'tf':
        return 'TF'
    return 'Other'
|
| 186 |
+
|
| 187 |
+
def get_tf_and_kinase_fusions_dataset():
    """
    Build the TF::TF and Kinase::Kinase fusion set for plotting.

    Reads the Salokas et al. 2020 TF/kinase annotation table and the
    FusOn-DB head/tail database, labels each fusion by its head and tail
    gene types, and keeps only the TF::TF and Kinase::Kinase categories.

    Returns:
        pd.DataFrame with columns ['sequence', 'ID', 'fusion_type', 'type'].
    """
    # Reference table of genes annotated as transcription factor or kinase
    tf_kinase_parts = pd.read_csv("data/salokas_2020_tableS3.csv")
    log_update(f"{tf_kinase_parts}")  # was print(); use log_update like the rest of the pipeline
    ht_tf_kinase_dict = dict(zip(tf_kinase_parts['Gene'], tf_kinase_parts['Kinase or TF']))

    # This one has each row with one fusiongene name
    fuson_ht_db = pd.read_csv("../../data/blast/fuson_ht_db.csv")
    fuson_ht_db[['hg', 'tg']] = fuson_ht_db['fusiongenes'].str.split("::", expand=True)

    fuson_ht_db['hg_type'] = fuson_ht_db['hg'].apply(lambda g: get_gene_type(g, ht_tf_kinase_dict))
    fuson_ht_db['tg_type'] = fuson_ht_db['tg'].apply(lambda g: get_gene_type(g, ht_tf_kinase_dict))
    fuson_ht_db['fusion_type'] = fuson_ht_db['hg_type'] + '::' + fuson_ht_db['tg_type']
    fuson_ht_db['type'] = ['fusion'] * len(fuson_ht_db)

    # Manually chosen, visually separable categories.
    # (The original also derived categories from value_counts().reset_index()['index'],
    # which was dead code — immediately overwritten — and breaks on pandas >= 2,
    # where the reset column is named after the counted column instead of 'index'.)
    categories = ["TF::TF", "Kinase::Kinase"]
    log_update(f"{categories}")

    # Keep every fusion in each selected category
    plot_df = pd.concat(
        [fuson_ht_db.loc[fuson_ht_db['fusion_type'] == cat] for cat in categories],
        axis=0
    ).reset_index(drop=True)

    log_update(f"{plot_df['fusion_type'].value_counts()}")

    # Standardize column names for downstream embedding/merging
    plot_df = plot_df[['aa_seq', 'fusiongenes', 'fusion_type', 'type']].rename(
        columns={'aa_seq': 'sequence', 'fusiongenes': 'ID'}
    )

    return plot_df
|
| 223 |
+
|
| 224 |
+
def make_tf_and_kinase_fusions_plot(seqs_with_embeddings, savedir='', dimred_type='umap'):
    """
    Save the TF/Kinase fusion source data CSV and its scatter plot.

    Args:
        seqs_with_embeddings (pd.DataFrame): must contain f'{dimred_type}1',
            f'{dimred_type}2', 'sequence', 'fusion_type', and 'ID' columns.
        savedir (str): output directory (created if missing).
        dimred_type (str): tag used in output file names ('umap' or 'tsne').
    """
    fuson_db = pd.read_csv("../../data/fuson_db.csv")
    seq_id_dict = dict(zip(fuson_db['aa_seq'], fuson_db['seq_id']))

    # .copy() so adding seq_id below mutates a fresh frame, not a view of the
    # caller's DataFrame (avoids pandas SettingWithCopyWarning)
    data = seqs_with_embeddings[[f'{dimred_type}1', f'{dimred_type}2', 'sequence', 'fusion_type', 'ID']].copy()
    data['seq_id'] = data['sequence'].map(seq_id_dict)

    os.makedirs(savedir, exist_ok=True)
    data.to_csv(f"{savedir}/{dimred_type}_tf_and_kinase_fusions_source_data.csv", index=False)
    plot_umap_scatter_tftf_kk(data, filename=f"{savedir}/{dimred_type}_tf_and_kinase_fusions_visualization.png")
|
| 236 |
+
|
| 237 |
+
def tf_and_kinase_fusions_plot(dimred_types, output_dir):
    """
    Embed the TF::TF and Kinase::Kinase fusion set, then plot each requested
    dimensionality reduction (Fig. S3a).

    Args:
        dimred_types (list[str]): e.g. ['umap'] or ['umap', 'tsne'].
        output_dir (str): root results directory for this run.
    """
    plot_df = get_tf_and_kinase_fusions_dataset()
    plot_df.to_csv("data/tf_and_kinase_fusions.csv", index=False)

    # Embed the curated fusion set with the configured FusOn-pLM checkpoint
    input_fname = 'tf_and_kinase'
    all_embedding_paths = embed_dataset_for_benchmark(
        fuson_ckpts=config.FUSON_PLM_CKPT,
        input_data_path='data/tf_and_kinase_fusions.csv', input_fname=input_fname,
        average=True, seq_col='sequence',
        benchmark_fusonplm=True,
        benchmark_esm=False,
        benchmark_fo_puncta_ml=False,
        overwrite=config.PERMISSION_TO_OVERWRITE)

    # For each of the models we are benchmarking, load embeddings and make plots
    log_update("\nEmbedding sequences")
    for embedding_path, details in all_embedding_paths.items():
        log_update(f"\tBenchmarking embeddings at: {embedding_path}")
        try:
            with open(embedding_path, "rb") as f:
                embeddings = pickle.load(f)
        except Exception as e:
            # chain the original error rather than hiding it behind a bare except
            raise Exception(f"Cannot read embeddings from {embedding_path}") from e

        # combine the embeddings and fusion metadata into one dataframe
        seqs_with_embeddings = pd.DataFrame.from_dict(embeddings.items())
        seqs_with_embeddings = seqs_with_embeddings.rename(columns={0: 'sequence', 1: 'embedding'})
        seqs_with_embeddings = pd.merge(seqs_with_embeddings, plot_df, on='sequence', how='inner')

        # dimensionality-reduce the embeddings and plot each reduction
        for dimred_type in dimred_types:
            dimred_embeddings = get_dimred_embeddings(seqs_with_embeddings['embedding'].tolist(), dimred_type=dimred_type)

            # turn the result into a dataframe, and add it to seqs_with_embeddings
            data = pd.DataFrame(dimred_embeddings, columns=[f'{dimred_type}1', f'{dimred_type}2'])
            seqs_with_embeddings[[f'{dimred_type}1', f'{dimred_type}2']] = data

            # subdirectory mirrors the embedding path (e.g. fuson_plm/best)
            intermediate = '/'.join(embedding_path.split('embeddings/')[1].split('/')[0:-1])
            cur_output_dir = f"{output_dir}/{dimred_type}_plots/{intermediate}/{input_fname}"

            os.makedirs(cur_output_dir, exist_ok=True)
            make_tf_and_kinase_fusions_plot(seqs_with_embeddings, savedir=cur_output_dir, dimred_type=dimred_type)
|
| 287 |
+
|
| 288 |
+
def make_fusion_v_parts_favorites_plot(seqs_with_embeddings, savedir=None, dimred_type='umap'):
    """
    Make plots showing that PAX3::FOXO1, EWS::FLI1, SS18::SSX1, EML4::ALK are embedded
    distinctly from their heads and tails.

    Args:
        seqs_with_embeddings (pd.DataFrame): must contain a 'sequence' column and the
            reduced coordinate columns f'{dimred_type}1' and f'{dimred_type}2'.
        savedir (str or None): directory where the figure and its source data are written.
        dimred_type (str): 'umap' or 'tsne'; selects which coordinate columns are plotted.
    """
    set_font()

    # Load one sequence each for four proteins in the test set (PAX3::FOXO1, EWS::FLI1,
    # SS18::SSX1, EML4::ALK) and tag each row as fusion, head parent, or tail parent.
    data = pd.read_csv("data/top_genes.csv")
    seqs_with_embeddings = pd.merge(seqs_with_embeddings, data, on="sequence")
    seqs_with_embeddings["Type"] = [""] * len(seqs_with_embeddings)
    fusion_mask = seqs_with_embeddings["gene"].str.contains("::")
    seqs_with_embeddings.loc[fusion_mask, "Type"] = "fusion_embeddings"
    gene_parts = seqs_with_embeddings.loc[fusion_mask]["gene"].str.split("::", expand=True)
    heads = gene_parts[0].tolist()
    tails = gene_parts[1].tolist()
    seqs_with_embeddings.loc[seqs_with_embeddings["gene"].isin(heads), "Type"] = "h_embeddings"
    seqs_with_embeddings.loc[seqs_with_embeddings["gene"].isin(tails), "Type"] = "t_embeddings"

    # Build one row per fusion holding its head/tail gene names and sequences
    merge = seqs_with_embeddings.loc[fusion_mask].reset_index(drop=True)[['gene', 'sequence']]
    merge["head"] = merge["gene"].str.split("::", expand=True)[0]
    merge["tail"] = merge["gene"].str.split("::", expand=True)[1]
    merge = pd.merge(
        merge,
        seqs_with_embeddings[['gene', 'sequence']].rename(
            columns={'gene': 'head', 'sequence': 'h_sequence'}),
        on='head', how='left'
    )
    merge = pd.merge(
        merge,
        seqs_with_embeddings[['gene', 'sequence']].rename(
            columns={'gene': 'tail', 'sequence': 't_sequence'}),
        on='tail', how='left'
    )

    # Define colors, markers, and display labels for each embedding type
    colors = {
        'fusion_embeddings': '#cf9dfa',  # old color #0C4A4D
        'h_embeddings': '#eb8888',       # old color #619283
        't_embeddings': '#5fa3e3',       # old color #619283
    }
    markers = {
        'fusion_embeddings': 'o',
        'h_embeddings': '^',
        't_embeddings': 'v',
    }
    label_map = {
        'fusion_embeddings': 'Fusion',
        'h_embeddings': 'Head',
        't_embeddings': 'Tail',
    }
    label_transform = {
        'tsne': 't-SNE',
        'umap': 'UMAP',
    }

    # Create a 2x3 grid of plots.
    # NOTE: the stray bare plt.figure() the original opened (and never used) is removed.
    fig, axes = plt.subplots(2, 3, figsize=(18, 12))

    # Manually set axis ranges for cleaner plotting.
    # NOTE(review): these bounds were tuned for this dataset's UMAP coordinates —
    # confirm they still make sense when dimred_type == 'tsne'.
    x_min, x_max = [11, 16]
    y_min, y_max = [10, 22]

    # Determine tick positions
    x_ticks = np.arange(x_min, x_max + 1, 1)
    y_ticks = np.arange(y_min, y_max + 1, 1)

    # Flatten the axes array for easier iteration; cap at the number of fusions so a
    # shorter favorites list no longer raises a KeyError on the unused axes.
    axes = axes.flatten()
    for i, ax in enumerate(axes[:len(merge)]):
        # Extract the gene names for this fusion and its parents
        fgene_name = merge.loc[i, 'gene']
        hgene = merge.loc[i, 'head']
        tgene = merge.loc[i, 'tail']

        # Filter the reduced embeddings for the relevant entries
        panel_data = seqs_with_embeddings[seqs_with_embeddings['gene'].isin([fgene_name, hgene, tgene])]

        # Plot each type (fusion / head / tail)
        for emb_type in panel_data['Type'].unique():
            subset = panel_data[panel_data['Type'] == emb_type]
            ax.scatter(subset[f'{dimred_type}1'], subset[f'{dimred_type}2'],
                       label=label_map[emb_type], color=colors[emb_type],
                       marker=markers[emb_type], s=120, zorder=3)

        ax.set_title(f'{fgene_name}', fontsize=44)
        ax.set_xlabel(f'{label_transform[dimred_type]} 1', fontsize=44)
        ax.set_ylabel(f'{label_transform[dimred_type]} 2', fontsize=44)
        ax.grid(True, which='both', linestyle='--', linewidth=0.5, color='gray', zorder=1)

        # Set the same limits and ticks for all axes
        ax.set_xlim(x_min, x_max)
        ax.set_ylim(y_min, y_max)
        ax.set_xticks(x_ticks)
        ax.set_yticks(y_ticks)

        # Rotate x-axis labels, then enlarge all tick labels (the final size is 24;
        # the intermediate labelsize=16 tick_params call was redundant and is removed)
        ax.set_xticklabels(ax.get_xticks(), rotation=45, ha='right')
        for label in ax.get_xticklabels():
            label.set_fontsize(24)
        for label in ax.get_yticklabels():
            label.set_fontsize(24)

        # Show the legend on the first panel only
        if i == 0:
            legend = ax.legend(fontsize=20, markerscale=2, loc='best')
            for text in legend.get_texts():
                text.set_fontsize(24)

    # Adjust layout to prevent overlap
    plt.tight_layout()

    # BUG FIX: save BEFORE plt.show(). On interactive backends show() can leave the
    # current figure empty, so the original save-after-show risked a blank image.
    fig.savefig(f'{savedir}/{dimred_type}_favorites_visualization.png', dpi=300)
    plt.show()

    # Save the source data, mapping each sequence back to its database seq_id
    seq_to_id_dict = pd.read_csv("../../data/fuson_db.csv")
    seq_to_id_dict = dict(zip(seq_to_id_dict['aa_seq'], seq_to_id_dict['seq_id']))
    seqs_with_embeddings['seq_id'] = seqs_with_embeddings['sequence'].map(seq_to_id_dict)
    # BUG FIX: use the active dimred_type's coordinate columns instead of the hardcoded
    # 'umap1'/'umap2', which raised a KeyError whenever dimred_type == 'tsne'.
    coord_cols = [f'{dimred_type}1', f'{dimred_type}2']
    seqs_with_embeddings[coord_cols + ['sequence', 'Type', 'gene', 'id', 'seq_id']].to_csv(
        f"{savedir}/{dimred_type}_favorites_source_data.csv", index=False)
|
| 422 |
+
def fusion_v_parts_favorites(dimred_types, output_dir):
    """
    Make the embeddings for the four favorite fusions, THEN call the plot.

    Args:
        dimred_types (list[str]): subset of ['umap', 'tsne'] to compute and plot.
        output_dir (str): root results directory for this run.
    """
    # path to the pkl file with FOdb embeddings
    input_fname = 'favorites'
    all_embedding_paths = embed_dataset_for_benchmark(
        fuson_ckpts=config.FUSON_PLM_CKPT,
        input_data_path='data/top_genes.csv', input_fname=input_fname,
        average=True, seq_col='sequence',
        benchmark_fusonplm=True,
        benchmark_esm=False,
        benchmark_fo_puncta_ml=False,
        overwrite=config.PERMISSION_TO_OVERWRITE)

    # For each of the models we are benchmarking, load embeddings and make plots
    log_update("\nEmbedding sequences")
    # loop through the embedding paths and plot each one
    # (only the keys are needed; the unused `details` values are no longer unpacked)
    for embedding_path in all_embedding_paths:
        log_update(f"\tBenchmarking embeddings at: {embedding_path}")
        # BUG FIX: the original bare `except:` also swallowed KeyboardInterrupt/SystemExit
        # and discarded the underlying traceback; catch Exception and chain the cause.
        try:
            with open(embedding_path, "rb") as f:
                embeddings = pickle.load(f)
        except Exception as e:
            raise Exception(f"Cannot read embeddings from {embedding_path}") from e

        # combine the embeddings into one dataframe; the column that was called
        # FusOn-pLM is now called embedding
        seqs_with_embeddings = pd.DataFrame.from_dict(embeddings.items())
        seqs_with_embeddings = seqs_with_embeddings.rename(columns={0: 'sequence', 1: 'embedding'})

        # get the dimensionality-reduced transform of the embeddings
        for dimred_type in dimred_types:
            dimred_embeddings = get_dimred_embeddings(
                seqs_with_embeddings['embedding'].tolist(), dimred_type=dimred_type)

            # turn the result into a dataframe, and add it to seqs_with_embeddings
            data = pd.DataFrame(dimred_embeddings, columns=[f'{dimred_type}1', f'{dimred_type}2'])
            seqs_with_embeddings[[f'{dimred_type}1', f'{dimred_type}2']] = data

            # make a subdirectory mirroring the embedding path layout
            # (the unused `model_name` local from the original is removed)
            intermediate = '/'.join(embedding_path.split('embeddings/')[1].split('/')[0:-1])
            cur_output_dir = f"{output_dir}/{dimred_type}_plots/{intermediate}/{input_fname}"
            os.makedirs(cur_output_dir, exist_ok=True)
            make_fusion_v_parts_favorites_plot(seqs_with_embeddings, savedir=cur_output_dir, dimred_type=dimred_type)
+
|
| 471 |
+
def main():
    """Run the embedding-exploration benchmark: embed, reduce, and plot."""
    # Each run writes into a fresh, timestamped results folder
    run_dir = f'results/{get_local_time()}'
    os.makedirs('results', exist_ok=True)
    os.makedirs(run_dir, exist_ok=True)

    # Collect the enabled dimensionality-reduction methods and pre-create one
    # plot subdirectory per method
    dimred_types = []
    for enabled, method in ((config.PLOT_UMAP, "umap"), (config.PLOT_TSNE, "tsne")):
        if enabled:
            dimred_types.append(method)
            os.makedirs(f"{run_dir}/{method}_plots", exist_ok=True)

    with open_logfile(f'{run_dir}/embedding_exploration_log.txt'):
        print_configpy(config)
        # make the distinct embeddings plot
        fusion_v_parts_favorites(dimred_types, run_dir)

        tf_and_kinase_fusions_plot(dimred_types, run_dir)


if __name__ == "__main__":
    main()
|
fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/favorites/umap_favorites_source_data.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:28c0b51f513da01df3dee3c4e71aa0c583bd57d9878137bdac9e7ebc704694e4
|
| 3 |
+
size 17383
|
fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/favorites/umap_favorites_visualization.png
ADDED
|
fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/tf_and_kinase/umap_tf_and_kinase_fusions_source_data.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:b26a5a6c2f8f54225fd46f01dab52813532438732624561af8e2e4ad005e5dc7
|
| 3 |
+
size 570073
|
fuson_plm/benchmarking/embedding_exploration/results/final/umap_plots/fuson_plm/best/tf_and_kinase/umap_tf_and_kinase_fusions_visualization.png
ADDED
|
fuson_plm/benchmarking/mutation_prediction/README.md
CHANGED
|
@@ -81,7 +81,7 @@ To run, use:
|
|
| 81 |
```
|
| 82 |
nohup python discover.py > discover.out 2> discover.err &
|
| 83 |
```
|
| 84 |
-
- All **results** are stored in `
|
| 85 |
|
| 86 |
Below are the FusOn-pLM paper results in `results/final`:
|
| 87 |
|
|
|
|
| 81 |
```
|
| 82 |
nohup python discover.py > discover.out 2> discover.err &
|
| 83 |
```
|
| 84 |
+
- All **results** are stored in `mutation_prediction/results/<timestamp>`, where `timestamp` is a unique string encoding the date and time when you started training.
|
| 85 |
|
| 86 |
Below are the FusOn-pLM paper results in `results/final`:
|
| 87 |
|
fuson_plm/benchmarking/puncta/train.py
CHANGED
|
@@ -5,7 +5,7 @@ import numpy as np
|
|
| 5 |
import pickle
|
| 6 |
import os
|
| 7 |
|
| 8 |
-
from fuson_plm.benchmarking.xgboost_predictor import train_final_predictor, evaluate_predictor
|
| 9 |
from fuson_plm.benchmarking.embed import embed_dataset_for_benchmark
|
| 10 |
import fuson_plm.benchmarking.puncta.config as config
|
| 11 |
from fuson_plm.benchmarking.puncta.plot import make_all_final_bar_charts
|
|
|
|
| 5 |
import pickle
|
| 6 |
import os
|
| 7 |
|
| 8 |
+
from fuson_plm.benchmarking.xgboost_predictor import train_final_predictor, evaluate_predictor
|
| 9 |
from fuson_plm.benchmarking.embed import embed_dataset_for_benchmark
|
| 10 |
import fuson_plm.benchmarking.puncta.config as config
|
| 11 |
from fuson_plm.benchmarking.puncta.plot import make_all_final_bar_charts
|
fuson_plm/benchmarking/xgboost_predictor.py
ADDED
|
@@ -0,0 +1,65 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from sklearn.model_selection import train_test_split, StratifiedKFold
|
| 2 |
+
from sklearn.metrics import accuracy_score, recall_score, f1_score, precision_score, roc_auc_score, average_precision_score
|
| 3 |
+
from fuson_plm.utils.logging import log_update
|
| 4 |
+
import time
|
| 5 |
+
import xgboost as xgb
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
def train_final_predictor(X_train, y_train, n_estimators=50, tree_method="hist"):
    """Fit an XGBoost classifier on the full training set and return it.

    Args:
        X_train: training feature matrix.
        y_train: binary training labels.
        n_estimators (int): number of boosting rounds.
        tree_method (str): XGBoost tree construction algorithm.
    """
    model = xgb.XGBClassifier(n_estimators=n_estimators, tree_method=tree_method)
    model.fit(X_train, y_train)
    return model
|
| 14 |
+
def _binary_classification_metrics(y_true, y_pred_label, y_pred_prob):
    """Build a one-row DataFrame of binary-classification metrics.

    'AUROC'/'AUPRC' are probability-based; the '... Label' variants are computed
    from the hard label predictions. Column order matches the original output.
    """
    return pd.DataFrame(data={
        'Accuracy': [accuracy_score(y_true, y_pred_label)],
        'Precision': [precision_score(y_true, y_pred_label)],
        'Recall': [recall_score(y_true, y_pred_label)],
        'F1 Score': [f1_score(y_true, y_pred_label)],
        'AUROC': [roc_auc_score(y_true, y_pred_prob)],
        'AUROC Label': [roc_auc_score(y_true, y_pred_label)],
        'AUPRC': [average_precision_score(y_true, y_pred_prob)],
        'AUPRC Label': [average_precision_score(y_true, y_pred_label)],
    })

def evaluate_predictor(clf, X_test, y_test, class1_thresh=None):
    """Evaluate a trained binary classifier on a held-out test set.

    Args:
        clf: fitted classifier exposing predict() and predict_proba().
        X_test: test feature matrix.
        y_test: binary test labels.
        class1_thresh (float or None): optional custom probability threshold for
            calling class 1; when given, a second metrics table is computed from
            labels re-derived at this threshold.

    Returns:
        tuple: (automatic_stats_df, custom_stats_df). custom_stats_df is None
        when class1_thresh is None. The probability-based AUROC/AUPRC columns
        are identical in both tables, since thresholding does not affect them.
    """
    # Predict labels (classifier's default 0.5 threshold) and class-1 probabilities
    y_pred_test = clf.predict(X_test)
    y_pred_prob_test = clf.predict_proba(X_test)[:, 1]

    # Metrics at the automatic threshold.
    # The previously duplicated metric computations are factored into a helper.
    automatic_stats_df = _binary_classification_metrics(y_test, y_pred_test, y_pred_prob_test)

    # Metrics at the custom threshold, if requested
    custom_stats_df = None
    if class1_thresh is not None:
        y_pred_customthresh_test = np.where(np.asarray(y_pred_prob_test) >= class1_thresh, 1, 0)
        custom_stats_df = _binary_classification_metrics(y_test, y_pred_customthresh_test, y_pred_prob_test)

    return automatic_stats_df, custom_stats_df
|