Remove outdated environment configuration files and update README with detailed usage instructions for the Protein Location Predictor application. Add error handling for model loading in prediction utility, and include a sample protein sequence in FASTA format. Update the project structure and installation steps to reflect the latest changes.
Browse files- Data/P50307.fasta +8 -0
- Data/TaxDistributionPSORT.svg +0 -0
- Envs/requirements.txt +0 -121
- README.md +228 -15
- RepoStructure.txt +0 -29
- Envs/environment.yml → environment.yml +0 -0
- notebooks/01_EDA_Psort.ipynb +2 -2
- protein_predictor_readme.md:Zone.Identifier +3 -0
- src/my_utils.py +26 -3
Data/P50307.fasta
ADDED
|
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
>sp|P50307|METK_STAAU S-adenosylmethionine synthase OS=Staphylococcus aureus OX=1280 GN=metK PE=3 SV=1
|
| 2 |
+
MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACETTVTTGMALIAGEIS
|
| 3 |
+
TTTYVDIPKVVRETIKEIGYTRAKYGYDYETMAILTAIDEQSPDIAQGVDKALEYRDKDS
|
| 4 |
+
EEEIEATGAGDQGLMFGYATNETETYMPLAIYLSHQLAKRLSDVRKDGTLNYLRPDGKVQ
|
| 5 |
+
VTVEYDENDNPVRIDTIVVSTQHAEDVTLEQIQEDIKAHVIYPTVPENLINEQTKFYINP
|
| 6 |
+
TGRFVIGGPQGDAGLTGRKIIVDTYGGIARHGGGCFSGKDPTKVDRSAAYAARYVAKNIV
|
| 7 |
+
AAGLADQCEVQLAYAIGVAEPVSIAIDTFGTGKVSEGQLVEAVRKHFDLRPAGIIKMLDL
|
| 8 |
+
KQPIYKQTAAYGHFGRTDVLFPWEKLDKVEELKDAVK
|
Data/TaxDistributionPSORT.svg
DELETED
Envs/requirements.txt
DELETED
|
@@ -1,121 +0,0 @@
|
|
| 1 |
-
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
|
| 2 |
-
attrs @ file:///croot/attrs_1734533101012/work
|
| 3 |
-
biopython @ file:///home/builder/ci_310/biopython_1640788437968/work
|
| 4 |
-
biotite==0.41.2
|
| 5 |
-
Brotli==1.1.0
|
| 6 |
-
certifi==2025.1.31
|
| 7 |
-
charset-normalizer==3.4.1
|
| 8 |
-
cloudpathlib==0.20.0
|
| 9 |
-
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1733502965406/work
|
| 10 |
-
contourpy==1.3.1
|
| 11 |
-
cycler==0.12.1
|
| 12 |
-
debugpy @ file:///croot/debugpy_1736267418885/work
|
| 13 |
-
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
|
| 14 |
-
dna_features_viewer==3.1.4
|
| 15 |
-
einops==0.8.1
|
| 16 |
-
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1733327148154/work
|
| 17 |
-
esm==3.1.4
|
| 18 |
-
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1733208806608/work
|
| 19 |
-
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1733569351617/work
|
| 20 |
-
fastjsonschema @ file:///croot/python-fastjsonschema_1731939362158/work
|
| 21 |
-
filelock==3.17.0
|
| 22 |
-
fonttools==4.56.0
|
| 23 |
-
fsspec==2025.2.0
|
| 24 |
-
graphviz==0.20.3
|
| 25 |
-
huggingface-hub==0.29.1
|
| 26 |
-
idna==3.10
|
| 27 |
-
importlib_metadata @ file:///croot/importlib_metadata-suite_1732633488278/work
|
| 28 |
-
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
|
| 29 |
-
ipython @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_ipython_1740856895/work
|
| 30 |
-
ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1733493556527/work
|
| 31 |
-
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
|
| 32 |
-
Jinja2==3.1.5
|
| 33 |
-
joblib==1.4.2
|
| 34 |
-
jsonschema @ file:///croot/jsonschema_1728486696720/work
|
| 35 |
-
jsonschema-specifications @ file:///croot/jsonschema-specifications_1699032386549/work
|
| 36 |
-
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1654730843242/work
|
| 37 |
-
jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
|
| 38 |
-
jupyterlab_widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1733428046021/work
|
| 39 |
-
kaleido @ file:///home/conda/feedstock_root/build_artifacts/python-kaleido_1615204619408/work
|
| 40 |
-
kiwisolver==1.4.8
|
| 41 |
-
MarkupSafe==3.0.2
|
| 42 |
-
matplotlib==3.10.1
|
| 43 |
-
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
|
| 44 |
-
mkl-service==2.4.0
|
| 45 |
-
mkl_fft @ file:///io/mkl313/mkl_fft_1730824109137/work
|
| 46 |
-
mkl_random @ file:///io/mkl313/mkl_random_1730823916628/work
|
| 47 |
-
mpmath==1.3.0
|
| 48 |
-
msgpack==1.1.0
|
| 49 |
-
msgpack-numpy==0.4.8
|
| 50 |
-
narwhals @ file:///croot/narwhals_1742845957875/work
|
| 51 |
-
nbformat @ file:///croot/nbformat_1728049424075/work
|
| 52 |
-
nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
|
| 53 |
-
networkx==3.4.2
|
| 54 |
-
numpy @ file:///croot/numpy_and_numpy_base_1708638617955/work/dist/numpy-1.26.4-cp310-cp310-linux_x86_64.whl#sha256=d8cd837ed43e87f77e6efaa08e8de927ca030a1c9c5d04624432d6fb9a74a5ee
|
| 55 |
-
nvidia-cublas-cu12==12.4.5.8
|
| 56 |
-
nvidia-cuda-cupti-cu12==12.4.127
|
| 57 |
-
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 58 |
-
nvidia-cuda-runtime-cu12==12.4.127
|
| 59 |
-
nvidia-cudnn-cu12==9.1.0.70
|
| 60 |
-
nvidia-cufft-cu12==11.2.1.3
|
| 61 |
-
nvidia-curand-cu12==10.3.5.147
|
| 62 |
-
nvidia-cusolver-cu12==11.6.1.9
|
| 63 |
-
nvidia-cusparse-cu12==12.3.1.170
|
| 64 |
-
nvidia-cusparselt-cu12==0.6.2
|
| 65 |
-
nvidia-nccl-cu12==2.21.5
|
| 66 |
-
nvidia-nvjitlink-cu12==12.4.127
|
| 67 |
-
nvidia-nvtx-cu12==12.4.127
|
| 68 |
-
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1733203243479/work
|
| 69 |
-
pandas==2.2.3
|
| 70 |
-
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
|
| 71 |
-
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
|
| 72 |
-
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
|
| 73 |
-
pillow==11.1.0
|
| 74 |
-
platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1733232627818/work
|
| 75 |
-
plotly @ file:///home/conda/feedstock_root/build_artifacts/plotly_1742240435426/work
|
| 76 |
-
prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1737453357274/work
|
| 77 |
-
psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1653089181607/work
|
| 78 |
-
ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1733302279685/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=92c32ff62b5fd8cf325bec5ab90d7be3d2a8ca8c8a3813ff487a8d2002630d1f
|
| 79 |
-
pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
|
| 80 |
-
py3Dmol==2.4.2
|
| 81 |
-
pyfaidx @ file:///opt/conda/conda-bld/pyfaidx_1728570107633/work
|
| 82 |
-
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1736243443484/work
|
| 83 |
-
pyparsing==3.2.1
|
| 84 |
-
PyQt6==6.7.1
|
| 85 |
-
PyQt6_sip @ file:///croot/pyqt-split_1744804475988/work/pyqt_sip
|
| 86 |
-
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1733215673016/work
|
| 87 |
-
pytz==2025.1
|
| 88 |
-
PyVCF3 @ file:///opt/conda/conda-bld/pyvcf3_1650931562118/work
|
| 89 |
-
PyYAML==6.0.2
|
| 90 |
-
pyzmq @ file:///croot/pyzmq_1734687138743/work
|
| 91 |
-
referencing @ file:///croot/referencing_1699012038513/work
|
| 92 |
-
regex==2024.11.6
|
| 93 |
-
requests==2.32.3
|
| 94 |
-
rpds-py @ file:///croot/rpds-py_1736541261634/work
|
| 95 |
-
safetensors==0.5.3
|
| 96 |
-
scikit-learn==1.6.1
|
| 97 |
-
scipy==1.15.2
|
| 98 |
-
sip @ file:///croot/sip_1738856193618/work
|
| 99 |
-
six @ file:///home/conda/feedstock_root/build_artifacts/six_1733380938961/work
|
| 100 |
-
stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
|
| 101 |
-
sympy==1.13.1
|
| 102 |
-
tenacity==9.0.0
|
| 103 |
-
threadpoolctl==3.5.0
|
| 104 |
-
tokenizers==0.20.3
|
| 105 |
-
tomli @ file:///opt/conda/conda-bld/tomli_1657175507142/work
|
| 106 |
-
torch==2.6.0
|
| 107 |
-
torchtext==0.18.0
|
| 108 |
-
torchvision==0.21.0
|
| 109 |
-
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827254365/work
|
| 110 |
-
tqdm==4.67.1
|
| 111 |
-
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
|
| 112 |
-
transformers==4.46.3
|
| 113 |
-
triton==3.2.0
|
| 114 |
-
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1733188668063/work
|
| 115 |
-
tzdata==2025.1
|
| 116 |
-
uniprot-id-mapper==1.1.4
|
| 117 |
-
urllib3==2.3.0
|
| 118 |
-
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
|
| 119 |
-
widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1733128559935/work
|
| 120 |
-
zipp @ file:///croot/zipp_1732630741423/work
|
| 121 |
-
zstd==1.5.6.5
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
README.md
CHANGED
|
@@ -1,16 +1,229 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
- en
|
| 5 |
-
metrics:
|
| 6 |
-
- accuracy
|
| 7 |
-
- precision
|
| 8 |
-
- f1
|
| 9 |
-
- recall
|
| 10 |
-
base_model:
|
| 11 |
-
- Rostlab/ProstT5
|
| 12 |
-
- EvolutionaryScale/esmc-300m-2024-12
|
| 13 |
-
- EvolutionaryScale/esmc-600m-2024-12
|
| 14 |
-
tags:
|
| 15 |
-
- biology
|
| 16 |
-
---
|
|
|
|
| 1 |
+
# Protein Location Predictor
|
| 2 |
+
|
| 3 |
+
A comprehensive GUI application for predicting protein subcellular localization using state-of-the-art machine learning models including PROST-T5 and ESM-C embeddings.
|
| 4 |
+
|
| 5 |
+
## Features
|
| 6 |
+
|
| 7 |
+
- **Multiple Model Support**: Choose from three different prediction models:
|
| 8 |
+
- PROST-T5: Transformer-based protein language model
|
| 9 |
+
- ESM-C 300M: Evolutionary Scale Modeling (300M parameters)
|
| 10 |
+
- ESM-C 600M: Evolutionary Scale Modeling (600M parameters)
|
| 11 |
+
|
| 12 |
+
- **User-Friendly GUI**: Simple Tkinter-based interface with progress tracking
|
| 13 |
+
- **Sequential Processing**: Process multiple protein sequences from FASTA files
|
| 14 |
+
- **Flexible Output**: Save predictions with confidence scores in CSV format
|
| 15 |
+
- **Error Handling**: Comprehensive error handling and user feedback
|
| 16 |
+
|
| 17 |
+
## Requirements
|
| 18 |
+
|
| 19 |
+
### Dependencies
|
| 20 |
+
|
| 21 |
+
The project uses conda for environment management. All dependencies are specified in `environment.yml` and include:
|
| 22 |
+
|
| 23 |
+
- PyTorch with CUDA support
|
| 24 |
+
- Transformers library
|
| 25 |
+
- ESM models
|
| 26 |
+
- Scikit-learn
|
| 27 |
+
- BioPython
|
| 28 |
+
- NumPy, Joblib
|
| 29 |
+
- Tkinter (GUI components)
|
| 30 |
+
|
| 31 |
+
### Hardware Requirements
|
| 32 |
+
|
| 33 |
+
- **Minimum**: 8GB RAM, CPU-only execution
|
| 34 |
+
- **Recommended**: 16GB+ RAM, NVIDIA GPU with 8GB+ VRAM
|
| 35 |
+
- **Storage**: ~5GB for model weights and cache
|
| 36 |
+
|
| 37 |
+
## Installation
|
| 38 |
+
|
| 39 |
+
**Prerequisites**: Conda must be installed on your system. [Download Conda](https://docs.conda.io/en/latest/miniconda.html)
|
| 40 |
+
|
| 41 |
+
1. **Clone the repository from Hugging Face**:
|
| 42 |
+
|
| 43 |
+
```bash
|
| 44 |
+
# Make sure git-lfs is installed (https://git-lfs.com)
|
| 45 |
+
git lfs install
|
| 46 |
+
|
| 47 |
+
# Clone with all files
|
| 48 |
+
git clone https://huggingface.co/jpuglia/ProteinLocationPredictor
|
| 49 |
+
```
|
| 50 |
+
|
| 51 |
+
**Optional - Clone without large files** (just pointers):
|
| 52 |
+
```bash
|
| 53 |
+
# If you want to clone without large files - just their pointers
|
| 54 |
+
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/jpuglia/ProteinLocationPredictor
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
2. **Navigate to the project directory**:
|
| 58 |
+
```bash
|
| 59 |
+
cd ProteinLocationPredictor
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
3. **Create conda environment**:
|
| 63 |
+
```bash
|
| 64 |
+
conda env create -n protein-predictor -f environment.yml
|
| 65 |
+
conda activate protein-predictor
|
| 66 |
+
```
|
| 67 |
+
|
| 68 |
+
4. **Pre-trained models**:
|
| 69 |
+
- Model files are included in the repository via Git LFS
|
| 70 |
+
- If you cloned without large files, you'll need to download them separately
|
| 71 |
+
|
| 72 |
+
## Usage
|
| 73 |
+
|
| 74 |
+
### Running the GUI Application
|
| 75 |
+
|
| 76 |
+
```bash
|
| 77 |
+
conda activate protein-predictor
|
| 78 |
+
python gui.py
|
| 79 |
+
```
|
| 80 |
+
|
| 81 |
+
### Step-by-Step Instructions
|
| 82 |
+
|
| 83 |
+
1. **Launch the application**
|
| 84 |
+
- Run the GUI script
|
| 85 |
+
- The main window will appear with prediction options
|
| 86 |
+
|
| 87 |
+
2. **Load a FASTA file**
|
| 88 |
+
- Click "File" → "Load FASTA"
|
| 89 |
+
- Select your protein sequences file (`.fasta`, `.fa`, or `.fas`)
|
| 90 |
+
|
| 91 |
+
3. **Choose a prediction model**
|
| 92 |
+
- **PROST-T5**
|
| 93 |
+
- **ESM-C 300M**
|
| 94 |
+
- **ESM-C 600M**
|
| 95 |
+
|
| 96 |
+
4. **Run prediction**
|
| 97 |
+
- Click the corresponding prediction button
|
| 98 |
+
- Monitor progress in the progress bar window
|
| 99 |
+
- Select output directory when prompted
|
| 100 |
+
|
| 101 |
+
5. **Save results**
|
| 102 |
+
- Choose location and filename for prediction results
|
| 103 |
+
- Results are saved in CSV format with confidence scores for each subcellular location
|
| 104 |
+
|
| 105 |
+
### Input Format
|
| 106 |
+
|
| 107 |
+
FASTA files should contain protein sequences in standard format:
|
| 108 |
+
|
| 109 |
+
```
|
| 110 |
+
>protein_1
|
| 111 |
+
MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG
|
| 112 |
+
>protein_2
|
| 113 |
+
MKTIIALSYIFCLVFAHATAKASEQTDNLQWDLAAIDNSGGHNAVDIKQNLQFQCQNNLHGCF
|
| 114 |
+
```
|
| 115 |
+
|
| 116 |
+
### Output Format
|
| 117 |
+
|
| 118 |
+
Results are saved as CSV files with predictions for 6 subcellular locations, ranked by probability:
|
| 119 |
+
|
| 120 |
+
```csv
|
| 121 |
+
Sequence_ID,Prediction 1,Prediction 2,Prediction 3,Prediction 4,Prediction 5,Prediction 6
|
| 122 |
+
sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.9860),CytoplasmicMembrane (0.0081),Periplasmic (0.0029),Extracellular (0.0019),OuterMembrane (0.0007),Cellwall (0.0003)
|
| 123 |
+
```
|
| 124 |
+
|
| 125 |
+
**Predicted Locations:**
|
| 126 |
+
- **Cytoplasmic**: Interior of the cell
|
| 127 |
+
- **CytoplasmicMembrane**: Inner membrane
|
| 128 |
+
- **Periplasmic**: Space between inner and outer membranes
|
| 129 |
+
- **Extracellular**: Outside the cell
|
| 130 |
+
- **OuterMembrane**: Outer membrane
|
| 131 |
+
- **Cellwall**: Cell wall structure
|
| 132 |
+
|
| 133 |
+
## Model Details
|
| 134 |
+
|
| 135 |
+
### PROST-T5
|
| 136 |
+
- **Base Model**: Rostlab/ProstT5
|
| 137 |
+
- **Embedding Dimension**: 1024
|
| 138 |
+
- **Classifier**: Support Vector Machine (SVM)
|
| 139 |
+
- **Memory Usage**: ~4GB GPU/8GB RAM
|
| 140 |
+
|
| 141 |
+
### ESM-C Models
|
| 142 |
+
- **Base Models**: ESM-C 300M/600M
|
| 143 |
+
- **Embedding Dimension**: Variable (300M: 960, 600M: 1280)
|
| 144 |
+
- **Classifier**: Support Vector Machine (SVM)
|
| 145 |
+
- **Memory Usage**: 300M: ~2GB GPU, 600M: ~4GB GPU
|
| 146 |
+
|
| 147 |
+
## Troubleshooting
|
| 148 |
+
|
| 149 |
+
### Common Issues
|
| 150 |
+
|
| 151 |
+
1. **Out of Memory Errors**
|
| 152 |
+
- Reduce batch size or use CPU-only mode
|
| 153 |
+
- Close other applications to free memory
|
| 154 |
+
- Try smaller model (ESM-C 300M instead of 600M)
|
| 155 |
+
|
| 156 |
+
2. **Model Loading Errors**
|
| 157 |
+
- Ensure model files are in the correct `Models/` directory
|
| 158 |
+
- Check file permissions and integrity
|
| 159 |
+
- Clear Hugging Face cache: `rm -rf ~/.cache/huggingface/`
|
| 160 |
+
|
| 161 |
+
3. **CUDA Errors**
|
| 162 |
+
- Update GPU drivers
|
| 163 |
+
- Ensure CUDA-compatible PyTorch installation
|
| 164 |
+
- Fall back to CPU mode if GPU issues persist
|
| 165 |
+
|
| 166 |
+
### Performance Tips
|
| 167 |
+
|
| 168 |
+
- **GPU Usage**: Models automatically detect and use GPU when available
|
| 169 |
+
- **Memory Management**: CUDA cache is cleared after each prediction
|
| 170 |
+
- **Sequential Processing**: Sequences are processed one at a time with progress tracking
|
| 171 |
+
|
| 172 |
+
## Project Structure
|
| 173 |
+
|
| 174 |
+
```
|
| 175 |
+
ProteinLocationPredictor/
|
| 176 |
+
├── gui.py                  # Main GUI application
|
| 177 |
+
├── src/
|
| 178 |
+
│   └── my_utils.py         # Core prediction functions
|
| 179 |
+
├── Models/                 # Pre-trained model files (via Git LFS)
|
| 180 |
+
│   ├── Prost T5_svm.joblib
|
| 181 |
+
│   ├── Prost T5_le_svm.joblib
|
| 182 |
+
│   ├── ESMC-300m_svm.joblib
|
| 183 |
+
│   ├── ESMC-600m_svm.joblib
|
| 184 |
+
│   └── ...
|
| 185 |
+
├── environment.yml         # Conda environment specification
|
| 186 |
+
└── README.md               # This file
|
| 187 |
+
```
|
| 188 |
+
|
| 189 |
+
## Contributing
|
| 190 |
+
|
| 191 |
+
1. Fork the repository
|
| 192 |
+
2. Create a feature branch (`git checkout -b feature/amazing-feature`)
|
| 193 |
+
3. Commit your changes (`git commit -m 'Add amazing feature'`)
|
| 194 |
+
4. Push to the branch (`git push origin feature/amazing-feature`)
|
| 195 |
+
5. Open a Pull Request
|
| 196 |
+
|
| 197 |
+
## License
|
| 198 |
+
|
| 199 |
+
This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
|
| 200 |
+
|
| 201 |
+
## Citation
|
| 202 |
+
|
| 203 |
+
If you use this tool in your research, please cite:
|
| 204 |
+
|
| 205 |
+
```bibtex
|
| 206 |
+
@software{protein_location_predictor,
|
| 207 |
+
title={Protein Location Predictor},
|
| 208 |
+
author={Juan Diego Puglia},
|
| 209 |
+
year={2025},
|
| 210 |
+
url={https://huggingface.co/jpuglia/ProteinLocationPredictor}
|
| 211 |
+
}
|
| 212 |
+
```
|
| 213 |
+
|
| 214 |
+
## Acknowledgments
|
| 215 |
+
|
| 216 |
+
- [Rostlab](https://rostlab.org/) for the PROST-T5 model
|
| 217 |
+
- [EvolutionaryScale](https://www.evolutionaryscale.ai/) for the ESM-C models
|
| 218 |
+
- [Hugging Face](https://huggingface.co/) for model hosting and transformers library
|
| 219 |
+
- [BioPython](https://biopython.org/) for sequence handling utilities
|
| 220 |
+
|
| 221 |
+
## Contact
|
| 222 |
+
|
| 223 |
+
For questions, issues, or collaborations, please:
|
| 224 |
+
- Visit the [Hugging Face repository](https://huggingface.co/jpuglia/ProteinLocationPredictor)
|
| 225 |
+
- Open a discussion on the Hugging Face platform
|
| 226 |
+
|
| 227 |
---
|
| 228 |
+
|
| 229 |
+
**Note**: This tool is for research purposes. Please validate predictions with experimental methods for critical applications.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
RepoStructure.txt
DELETED
|
@@ -1,29 +0,0 @@
|
|
| 1 |
-
ProteinSubcellularLocPredictor/
|
| 2 |
-
│
|
| 3 |
-
├── Data/                   # Raw and processed datasets
|
| 4 |
-
│   ├── raw/                # Raw, unaltered data
|
| 5 |
-
│   ├── processed/          # Cleaned or feature-engineered data
|
| 6 |
-
│   └── README.md           # Explain data sources and formats
|
| 7 |
-
│
|
| 8 |
-
├── Notebooks/              # Jupyter notebooks for EDA, training, etc.
|
| 9 |
-
│   ├── 01_eda.ipynb
|
| 10 |
-
│   ├── 02_preprocessing.ipynb
|
| 11 |
-
│   ├── 03_training.ipynb
|
| 12 |
-
│   └── 04_evaluation.ipynb
|
| 13 |
-
│
|
| 14 |
-
├── Deployment/             # Code for using the trained model
|
| 15 |
-
│   ├── predictor.py        # Main script to load model and predict
|
| 16 |
-
│   ├── api.py              # Optional: REST API using Flask/FastAPI
|
| 17 |
-
│   └── cli.py              # Optional: Command-line interface
|
| 18 |
-
│
|
| 19 |
-
├── src/                    # Python modules shared between notebooks & deployment
|
| 20 |
-
│   ├── __init__.py
|
| 21 |
-
│   ├── preprocessing.py    # Feature engineering, tokenization, etc.
|
| 22 |
-
│   ├── model.py            # Model creation/training/loading
|
| 23 |
-
│   ├── utils.py            # Helper functions
|
| 24 |
-
│   └── config.py           # Paths, constants, and config values
|
| 25 |
-
│
|
| 26 |
-
├── .gitignore              # Ignore datasets, checkpoints, virtual envs, etc.
|
| 27 |
-
├── requirements.txt        # Python package dependencies
|
| 28 |
-
├── README.md               # Project overview, setup, usage
|
| 29 |
-
└── LICENSE                 # Your preferred open-source license (e.g., MIT)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
Envs/environment.yml → environment.yml
RENAMED
|
File without changes
|
notebooks/01_EDA_Psort.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:fae1ec4b018bfc4f3c580f34f7fa8e1e75a78848d2f3064778a2112fd8962fa4
|
| 3 |
+
size 10331242
|
protein_predictor_readme.md:Zone.Identifier
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[ZoneTransfer]
|
| 2 |
+
ZoneId=3
|
| 3 |
+
HostUrl=https://claude.ai/
|
src/my_utils.py
CHANGED
|
@@ -927,9 +927,32 @@ def predict_with_prost(fasta_path: str):
|
|
| 927 |
progress = ttk.Progressbar(progress_win, length=300, mode='determinate', maximum=total)
|
| 928 |
progress.pack(padx=10, pady=10)
|
| 929 |
|
| 930 |
-
# Load model/tokenizer
|
| 931 |
-
|
| 932 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
|
| 934 |
embeddings = {}
|
| 935 |
|
|
|
|
| 927 |
progress = ttk.Progressbar(progress_win, length=300, mode='determinate', maximum=total)
|
| 928 |
progress.pack(padx=10, pady=10)
|
| 929 |
|
| 930 |
+
# Load model/tokenizer with exception handling
|
| 931 |
+
try:
|
| 932 |
+
progress_label.config(text="Loading ProstT5 model...")
|
| 933 |
+
progress_win.update_idletasks()
|
| 934 |
+
|
| 935 |
+
tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False, legacy=True)
|
| 936 |
+
model = T5EncoderModel.from_pretrained("Rostlab/ProstT5")
|
| 937 |
+
|
| 938 |
+
progress_label.config(text="Model loaded successfully! Embedding sequences...")
|
| 939 |
+
progress_win.update_idletasks()
|
| 940 |
+
|
| 941 |
+
except RuntimeError as e:
|
| 942 |
+
progress_win.destroy()
|
| 943 |
+
if "Cannot allocate memory" in str(e):
|
| 944 |
+
messagebox.showerror(
|
| 945 |
+
"Memory Error",
|
| 946 |
+
"Insufficient memory to load ProstT5 model.\n\n"
|
| 947 |
+
"Please try:\n"
|
| 948 |
+
"1. Close other applications\n"
|
| 949 |
+
"2. Restart your computer\n"
|
| 950 |
+
"3. Clear the model cache:\n"
|
| 951 |
+
" rm -rf ~/.cache/huggingface/hub/models--Rostlab--ProstT5/"
|
| 952 |
+
)
|
| 953 |
+
else:
|
| 954 |
+
messagebox.showerror("Runtime Error", f"Error loading model: {str(e)}")
|
| 955 |
+
return
|
| 956 |
|
| 957 |
embeddings = {}
|
| 958 |
|