jpuglia committed on
Commit
ec4615b
·
1 Parent(s): eaa0e59

Remove outdated environment configuration files and update README with detailed usage instructions for the Protein Location Predictor application. Add error handling for model loading in prediction utility, and include a sample protein sequence in FASTA format. Update the project structure and installation steps to reflect the latest changes.

Browse files
Data/P50307.fasta ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ >sp|P50307|METK_STAAU S-adenosylmethionine synthase OS=Staphylococcus aureus OX=1280 GN=metK PE=3 SV=1
2
+ MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACETTVTTGMALIAGEIS
3
+ TTTYVDIPKVVRETIKEIGYTRAKYGYDYETMAILTAIDEQSPDIAQGVDKALEYRDKDS
4
+ EEEIEATGAGDQGLMFGYATNETETYMPLAIYLSHQLAKRLSDVRKDGTLNYLRPDGKVQ
5
+ VTVEYDENDNPVRIDTIVVSTQHAEDVTLEQIQEDIKAHVIYPTVPENLINEQTKFYINP
6
+ TGRFVIGGPQGDAGLTGRKIIVDTYGGIARHGGGCFSGKDPTKVDRSAAYAARYVAKNIV
7
+ AAGLADQCEVQLAYAIGVAEPVSIAIDTFGTGKVSEGQLVEAVRKHFDLRPAGIIKMLDL
8
+ KQPIYKQTAAYGHFGRTDVLFPWEKLDKVEELKDAVK
Data/TaxDistributionPSORT.svg DELETED
Envs/requirements.txt DELETED
@@ -1,121 +0,0 @@
1
- asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
2
- attrs @ file:///croot/attrs_1734533101012/work
3
- biopython @ file:///home/builder/ci_310/biopython_1640788437968/work
4
- biotite==0.41.2
5
- Brotli==1.1.0
6
- certifi==2025.1.31
7
- charset-normalizer==3.4.1
8
- cloudpathlib==0.20.0
9
- comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1733502965406/work
10
- contourpy==1.3.1
11
- cycler==0.12.1
12
- debugpy @ file:///croot/debugpy_1736267418885/work
13
- decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
14
- dna_features_viewer==3.1.4
15
- einops==0.8.1
16
- entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1733327148154/work
17
- esm==3.1.4
18
- exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1733208806608/work
19
- executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1733569351617/work
20
- fastjsonschema @ file:///croot/python-fastjsonschema_1731939362158/work
21
- filelock==3.17.0
22
- fonttools==4.56.0
23
- fsspec==2025.2.0
24
- graphviz==0.20.3
25
- huggingface-hub==0.29.1
26
- idna==3.10
27
- importlib_metadata @ file:///croot/importlib_metadata-suite_1732633488278/work
28
- ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
29
- ipython @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_ipython_1740856895/work
30
- ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1733493556527/work
31
- jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
32
- Jinja2==3.1.5
33
- joblib==1.4.2
34
- jsonschema @ file:///croot/jsonschema_1728486696720/work
35
- jsonschema-specifications @ file:///croot/jsonschema-specifications_1699032386549/work
36
- jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1654730843242/work
37
- jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
38
- jupyterlab_widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1733428046021/work
39
- kaleido @ file:///home/conda/feedstock_root/build_artifacts/python-kaleido_1615204619408/work
40
- kiwisolver==1.4.8
41
- MarkupSafe==3.0.2
42
- matplotlib==3.10.1
43
- matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
44
- mkl-service==2.4.0
45
- mkl_fft @ file:///io/mkl313/mkl_fft_1730824109137/work
46
- mkl_random @ file:///io/mkl313/mkl_random_1730823916628/work
47
- mpmath==1.3.0
48
- msgpack==1.1.0
49
- msgpack-numpy==0.4.8
50
- narwhals @ file:///croot/narwhals_1742845957875/work
51
- nbformat @ file:///croot/nbformat_1728049424075/work
52
- nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
53
- networkx==3.4.2
54
- numpy @ file:///croot/numpy_and_numpy_base_1708638617955/work/dist/numpy-1.26.4-cp310-cp310-linux_x86_64.whl#sha256=d8cd837ed43e87f77e6efaa08e8de927ca030a1c9c5d04624432d6fb9a74a5ee
55
- nvidia-cublas-cu12==12.4.5.8
56
- nvidia-cuda-cupti-cu12==12.4.127
57
- nvidia-cuda-nvrtc-cu12==12.4.127
58
- nvidia-cuda-runtime-cu12==12.4.127
59
- nvidia-cudnn-cu12==9.1.0.70
60
- nvidia-cufft-cu12==11.2.1.3
61
- nvidia-curand-cu12==10.3.5.147
62
- nvidia-cusolver-cu12==11.6.1.9
63
- nvidia-cusparse-cu12==12.3.1.170
64
- nvidia-cusparselt-cu12==0.6.2
65
- nvidia-nccl-cu12==2.21.5
66
- nvidia-nvjitlink-cu12==12.4.127
67
- nvidia-nvtx-cu12==12.4.127
68
- packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1733203243479/work
69
- pandas==2.2.3
70
- parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
71
- pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
72
- pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
73
- pillow==11.1.0
74
- platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1733232627818/work
75
- plotly @ file:///home/conda/feedstock_root/build_artifacts/plotly_1742240435426/work
76
- prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1737453357274/work
77
- psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1653089181607/work
78
- ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1733302279685/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=92c32ff62b5fd8cf325bec5ab90d7be3d2a8ca8c8a3813ff487a8d2002630d1f
79
- pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
80
- py3Dmol==2.4.2
81
- pyfaidx @ file:///opt/conda/conda-bld/pyfaidx_1728570107633/work
82
- Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1736243443484/work
83
- pyparsing==3.2.1
84
- PyQt6==6.7.1
85
- PyQt6_sip @ file:///croot/pyqt-split_1744804475988/work/pyqt_sip
86
- python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1733215673016/work
87
- pytz==2025.1
88
- PyVCF3 @ file:///opt/conda/conda-bld/pyvcf3_1650931562118/work
89
- PyYAML==6.0.2
90
- pyzmq @ file:///croot/pyzmq_1734687138743/work
91
- referencing @ file:///croot/referencing_1699012038513/work
92
- regex==2024.11.6
93
- requests==2.32.3
94
- rpds-py @ file:///croot/rpds-py_1736541261634/work
95
- safetensors==0.5.3
96
- scikit-learn==1.6.1
97
- scipy==1.15.2
98
- sip @ file:///croot/sip_1738856193618/work
99
- six @ file:///home/conda/feedstock_root/build_artifacts/six_1733380938961/work
100
- stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
101
- sympy==1.13.1
102
- tenacity==9.0.0
103
- threadpoolctl==3.5.0
104
- tokenizers==0.20.3
105
- tomli @ file:///opt/conda/conda-bld/tomli_1657175507142/work
106
- torch==2.6.0
107
- torchtext==0.18.0
108
- torchvision==0.21.0
109
- tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827254365/work
110
- tqdm==4.67.1
111
- traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
112
- transformers==4.46.3
113
- triton==3.2.0
114
- typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1733188668063/work
115
- tzdata==2025.1
116
- uniprot-id-mapper==1.1.4
117
- urllib3==2.3.0
118
- wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
119
- widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1733128559935/work
120
- zipp @ file:///croot/zipp_1732630741423/work
121
- zstd==1.5.6.5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,16 +1,229 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
- license: mit
3
- language:
4
- - en
5
- metrics:
6
- - accuracy
7
- - precision
8
- - f1
9
- - recall
10
- base_model:
11
- - Rostlab/ProstT5
12
- - EvolutionaryScale/esmc-300m-2024-12
13
- - EvolutionaryScale/esmc-600m-2024-12
14
- tags:
15
- - biology
16
- ---
 
1
+ # Protein Location Predictor
2
+
3
+ A comprehensive GUI application for predicting protein subcellular localization using state-of-the-art machine learning models including PROST-T5 and ESM-C embeddings.
4
+
5
+ ## Features
6
+
7
+ - **Multiple Model Support**: Choose from three different prediction models:
8
+ - PROST-T5: Transformer-based protein language model
9
+ - ESM-C 300M: Evolutionary Scale Modeling (300M parameters)
10
+ - ESM-C 600M: Evolutionary Scale Modeling (600M parameters)
11
+
12
+ - **User-Friendly GUI**: Simple Tkinter-based interface with progress tracking
13
+ - **Sequential Processing**: Process multiple protein sequences from FASTA files
14
+ - **Flexible Output**: Save predictions with confidence scores in CSV format
15
+ - **Error Handling**: Comprehensive error handling and user feedback
16
+
17
+ ## Requirements
18
+
19
+ ### Dependencies
20
+
21
+ The project uses conda for environment management. All dependencies are specified in `environment.yml` and include:
22
+
23
+ - PyTorch with CUDA support
24
+ - Transformers library
25
+ - ESM models
26
+ - Scikit-learn
27
+ - BioPython
28
+ - NumPy, Joblib
29
+ - Tkinter (GUI components)
30
+
31
+ ### Hardware Requirements
32
+
33
+ - **Minimum**: 8GB RAM, CPU-only execution
34
+ - **Recommended**: 16GB+ RAM, NVIDIA GPU with 8GB+ VRAM
35
+ - **Storage**: ~5GB for model weights and cache
36
+
37
+ ## Installation
38
+
39
+ **Prerequisites**: Conda must be installed on your system. [Download Conda](https://docs.conda.io/en/latest/miniconda.html)
40
+
41
+ 1. **Clone the repository from Hugging Face**:
42
+
43
+ ```bash
44
+ # Make sure git-lfs is installed (https://git-lfs.com)
45
+ git lfs install
46
+
47
+ # Clone with all files
48
+ git clone https://huggingface.co/jpuglia/ProteinLocationPredictor
49
+ ```
50
+
51
+ **Optional - Clone without large files** (just pointers):
52
+ ```bash
53
+ # If you want to clone without large files - just their pointers
54
+ GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/jpuglia/ProteinLocationPredictor
55
+ ```
56
+
57
+ 2. **Navigate to the project directory**:
58
+ ```bash
59
+ cd ProteinLocationPredictor
60
+ ```
61
+
62
+ 3. **Create conda environment**:
63
+ ```bash
64
+ conda env create -n protein-predictor -f environment.yml
65
+ conda activate protein-predictor
66
+ ```
67
+
68
+ 4. **Pre-trained models**:
69
+ - Model files are included in the repository via Git LFS
70
+ - If you cloned without large files, download them afterwards by running `git lfs pull` inside the repository
71
+
72
+ ## Usage
73
+
74
+ ### Running the GUI Application
75
+
76
+ ```bash
77
+ conda activate protein-predictor
78
+ python gui.py
79
+ ```
80
+
81
+ ### Step-by-Step Instructions
82
+
83
+ 1. **Launch the application**
84
+ - Run the GUI script
85
+ - The main window will appear with prediction options
86
+
87
+ 2. **Load a FASTA file**
88
+ - Click "File" → "Load FASTA"
89
+ - Select your protein sequences file (`.fasta`, `.fa`, or `.fas`)
90
+
91
+ 3. **Choose a prediction model**
92
+ - **PROST-T5**
93
+ - **ESM-C 300M**
94
+ - **ESM-C 600M**
95
+
96
+ 4. **Run prediction**
97
+ - Click the corresponding prediction button
98
+ - Monitor progress in the progress bar window
99
+ - Select output directory when prompted
100
+
101
+ 5. **Save results**
102
+ - Choose location and filename for prediction results
103
+ - Results are saved in CSV format with confidence scores for each subcellular location
104
+
105
+ ### Input Format
106
+
107
+ FASTA files should contain protein sequences in standard format:
108
+
109
+ ```
110
+ >protein_1
111
+ MKTVRQERLKSIVRILERSKEPVSGAQLAEELSVSRQVIVQDIAYLRSLGYNIVATPRGYVLAGG
112
+ >protein_2
113
+ MKTIIALSYIFCLVFAHATAKASEQTDNLQWDLAAIDNSGGHNAVDIKQNLQFQCQNNLHGCF
114
+ ```
115
+
116
+ ### Output Format
117
+
118
+ Results are saved as CSV files with predictions for 6 subcellular locations, ranked by probability:
119
+
120
+ ```csv
121
+ Sequence_ID,Prediction 1,Prediction 2,Prediction 3,Prediction 4,Prediction 5,Prediction 6
122
+ sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.9860),CytoplasmicMembrane (0.0081),Periplasmic (0.0029),Extracellular (0.0019),OuterMembrane (0.0007),Cellwall (0.0003)
123
+ ```
124
+
125
+ **Predicted Locations:**
126
+ - **Cytoplasmic**: Interior of the cell
127
+ - **CytoplasmicMembrane**: Inner membrane
128
+ - **Periplasmic**: Space between inner and outer membranes
129
+ - **Extracellular**: Outside the cell
130
+ - **OuterMembrane**: Outer membrane
131
+ - **Cellwall**: Cell wall structure
132
+
133
+ ## Model Details
134
+
135
+ ### PROST-T5
136
+ - **Base Model**: Rostlab/ProstT5
137
+ - **Embedding Dimension**: 1024
138
+ - **Classifier**: Support Vector Machine (SVM)
139
+ - **Memory Usage**: ~4GB GPU/8GB RAM
140
+
141
+ ### ESM-C Models
142
+ - **Base Models**: ESM-C 300M/600M
143
+ - **Embedding Dimension**: Variable (300M: 960, 600M: 1152)
144
+ - **Classifier**: Support Vector Machine (SVM)
145
+ - **Memory Usage**: 300M: ~2GB GPU, 600M: ~4GB GPU
146
+
147
+ ## Troubleshooting
148
+
149
+ ### Common Issues
150
+
151
+ 1. **Out of Memory Errors**
152
+ - Reduce batch size or use CPU-only mode
153
+ - Close other applications to free memory
154
+ - Try smaller model (ESM-C 300M instead of 600M)
155
+
156
+ 2. **Model Loading Errors**
157
+ - Ensure model files are in the correct `Models/` directory
158
+ - Check file permissions and integrity
159
+ - Clear Hugging Face cache: `rm -rf ~/.cache/huggingface/`
160
+
161
+ 3. **CUDA Errors**
162
+ - Update GPU drivers
163
+ - Ensure CUDA-compatible PyTorch installation
164
+ - Fall back to CPU mode if GPU issues persist
165
+
166
+ ### Performance Tips
167
+
168
+ - **GPU Usage**: Models automatically detect and use GPU when available
169
+ - **Memory Management**: CUDA cache is cleared after each prediction
170
+ - **Sequential Processing**: Sequences are processed one at a time with progress tracking
171
+
172
+ ## Project Structure
173
+
174
+ ```
175
+ ProteinLocationPredictor/
176
+ ├── gui.py # Main GUI application
177
+ ├── src/
178
+ │   └── my_utils.py # Core prediction functions
179
+ ├── Models/ # Pre-trained model files (via Git LFS)
180
+ │   ├── Prost T5_svm.joblib
181
+ │   ├── Prost T5_le_svm.joblib
182
+ │   ├── ESMC-300m_svm.joblib
183
+ │   ├── ESMC-600m_svm.joblib
184
+ │   └── ...
185
+ ├── environment.yml # Conda environment specification
186
+ └── README.md # This file
187
+ ```
188
+
189
+ ## Contributing
190
+
191
+ 1. Fork the repository
192
+ 2. Create a feature branch (`git checkout -b feature/amazing-feature`)
193
+ 3. Commit your changes (`git commit -m 'Add amazing feature'`)
194
+ 4. Push to the branch (`git push origin feature/amazing-feature`)
195
+ 5. Open a Pull Request
196
+
197
+ ## License
198
+
199
+ This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
200
+
201
+ ## Citation
202
+
203
+ If you use this tool in your research, please cite:
204
+
205
+ ```bibtex
206
+ @software{protein_location_predictor,
207
+ title={Protein Location Predictor},
208
+ author={Juan Diego Puglia},
209
+ year={2025},
210
+ url={https://huggingface.co/jpuglia/ProteinLocationPredictor}
211
+ }
212
+ ```
213
+
214
+ ## Acknowledgments
215
+
216
+ - [Rostlab](https://rostlab.org/) for the PROST-T5 model
217
+ - [EvolutionaryScale](https://www.evolutionaryscale.ai/) for the ESM-C models
218
+ - [Hugging Face](https://huggingface.co/) for model hosting and transformers library
219
+ - [BioPython](https://biopython.org/) for sequence handling utilities
220
+
221
+ ## Contact
222
+
223
+ For questions, issues, or collaborations, please:
224
+ - Visit the [Hugging Face repository](https://huggingface.co/jpuglia/ProteinLocationPredictor)
225
+ - Open a discussion on the Hugging Face platform
226
+
227
  ---
228
+
229
+ **Note**: This tool is for research purposes. Please validate predictions with experimental methods for critical applications.
 
 
 
 
 
 
 
 
 
 
 
 
 
RepoStructure.txt DELETED
@@ -1,29 +0,0 @@
1
- ProteinSubcellularLocPredictor/
2
- │
3
- ├── Data/ # Raw and processed datasets
4
- │   ├── raw/ # Raw, unaltered data
5
- │   ├── processed/ # Cleaned or feature-engineered data
6
- │   └── README.md # Explain data sources and formats
7
- │
8
- ├── Notebooks/ # Jupyter notebooks for EDA, training, etc.
9
- │   ├── 01_eda.ipynb
10
- │   ├── 02_preprocessing.ipynb
11
- │   ├── 03_training.ipynb
12
- │   └── 04_evaluation.ipynb
13
- │
14
- ├── Deployment/ # Code for using the trained model
15
- │   ├── predictor.py # Main script to load model and predict
16
- │   ├── api.py # Optional: REST API using Flask/FastAPI
17
- │   └── cli.py # Optional: Command-line interface
18
- │
19
- ├── src/ # Python modules shared between notebooks & deployment
20
- │   ├── __init__.py
21
- │   ├── preprocessing.py # Feature engineering, tokenization, etc.
22
- │   ├── model.py # Model creation/training/loading
23
- │   ├── utils.py # Helper functions
24
- │   └── config.py # Paths, constants, and config values
25
- │
26
- ├── .gitignore # Ignore datasets, checkpoints, virtual envs, etc.
27
- ├── requirements.txt # Python package dependencies
28
- ├── README.md # Project overview, setup, usage
29
- └── LICENSE # Your preferred open-source license (e.g., MIT)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
Envs/environment.yml → environment.yml RENAMED
File without changes
notebooks/01_EDA_Psort.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:888f3665e5b2bf5e597acbe20bb839018b5ece80c55c3bf0bfd911904399031e
3
- size 10331239
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fae1ec4b018bfc4f3c580f34f7fa8e1e75a78848d2f3064778a2112fd8962fa4
3
+ size 10331242
protein_predictor_readme.md:Zone.Identifier ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ [ZoneTransfer]
2
+ ZoneId=3
3
+ HostUrl=https://claude.ai/
src/my_utils.py CHANGED
@@ -927,9 +927,32 @@ def predict_with_prost(fasta_path: str):
927
  progress = ttk.Progressbar(progress_win, length=300, mode='determinate', maximum=total)
928
  progress.pack(padx=10, pady=10)
929
 
930
- # Load model/tokenizer once
931
- tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False, legacy=True)
932
- model = T5EncoderModel.from_pretrained("Rostlab/ProstT5")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
933
 
934
  embeddings = {}
935
 
 
927
  progress = ttk.Progressbar(progress_win, length=300, mode='determinate', maximum=total)
928
  progress.pack(padx=10, pady=10)
929
 
930
+ # Load model/tokenizer with exception handling
931
+ try:
932
+ progress_label.config(text="Loading ProstT5 model...")
933
+ progress_win.update_idletasks()
934
+
935
+ tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False, legacy=True)
936
+ model = T5EncoderModel.from_pretrained("Rostlab/ProstT5")
937
+
938
+ progress_label.config(text="Model loaded successfully! Embedding sequences...")
939
+ progress_win.update_idletasks()
940
+
941
+ except RuntimeError as e:
942
+ progress_win.destroy()
943
+ if "Cannot allocate memory" in str(e):
944
+ messagebox.showerror(
945
+ "Memory Error",
946
+ "Insufficient memory to load ProstT5 model.\n\n"
947
+ "Please try:\n"
948
+ "1. Close other applications\n"
949
+ "2. Restart your computer\n"
950
+ "3. Clear the model cache:\n"
951
+ " rm -rf ~/.cache/huggingface/hub/models--Rostlab--ProstT5/"
952
+ )
953
+ else:
954
+ messagebox.showerror("Runtime Error", f"Error loading model: {str(e)}")
955
+ return
956
 
957
  embeddings = {}
958