Upload folder using huggingface_hub
Browse files- .gitattributes +10 -0
- .gitignore +4 -0
- Data/ePSORTdb.tsv +0 -0
- Data/trainingData.csv +0 -0
- Envs/environment.yml +208 -0
- Envs/requirements.txt +121 -0
- Plots/ConfusionMatrix/CM_ESM600m.png +0 -0
- Plots/ConfusionMatrix/CM_RF_ESM300m.png +0 -0
- Plots/ConfusionMatrix/CM_RF_ProstT5.png +0 -0
- Plots/ConfusionMatrix/CM_SVM_ESM300m.png +0 -0
- Plots/ConfusionMatrix/CM_SVM_ESM600m.png +0 -0
- Plots/ConfusionMatrix/CM_SVM_ProstT5.png +0 -0
- Plots/Embeddings/PCA_ESM300m.png +3 -0
- Plots/Embeddings/PCA_ESM600m.png +3 -0
- Plots/Embeddings/PCA_ProstT5.png +3 -0
- Plots/Embeddings/UMAP_ESM300m.png +3 -0
- Plots/Embeddings/UMAP_ESM600m.png +3 -0
- Plots/Embeddings/UMAP_ProstT5.png +3 -0
- Plots/Embeddings/t-SNE_ESM300m.png +3 -0
- Plots/Embeddings/t-SNE_ESM600m.png +3 -0
- Plots/Embeddings/t-SNE_ProstT5.png +3 -0
- Plots/ModelEvaluations/RFevaluacion.png +0 -0
- Plots/ModelEvaluations/SVMevaluacion.png +0 -0
- Plots/TaxDistributionPSORT.svg +0 -0
- ProteinLocationPredictor/.gitattributes +35 -0
- ProteinLocationPredictor/README.md +3 -0
- RepoStructure.txt +29 -0
- notebooks/EDA_Psort.ipynb +0 -0
- notebooks/ESMC_300m.ipynb +421 -0
- notebooks/ESMC_600m.ipynb +256 -0
- notebooks/EmbAnalisis.ipynb +3 -0
- notebooks/ProstT5.ipynb +526 -0
- notebooks/__pycache__/my_utils.cpython-310.pyc +0 -0
- notebooks/hyperparamsRF.ipynb +0 -0
- notebooks/my_utils.py +607 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
Plots/Embeddings/PCA_ESM300m.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
Plots/Embeddings/PCA_ESM600m.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
Plots/Embeddings/PCA_ProstT5.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
Plots/Embeddings/UMAP_ESM300m.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
Plots/Embeddings/UMAP_ESM600m.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
Plots/Embeddings/UMAP_ProstT5.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
Plots/Embeddings/t-SNE_ESM300m.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
Plots/Embeddings/t-SNE_ESM600m.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
Plots/Embeddings/t-SNE_ProstT5.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
notebooks/EmbAnalisis.ipynb filter=lfs diff=lfs merge=lfs -text
|
.gitignore
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.plk
|
| 2 |
+
*.pkl
|
| 3 |
+
*.npy
|
| 4 |
+
*.joblib
|
Data/ePSORTdb.tsv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Data/trainingData.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
Envs/environment.yml
ADDED
|
@@ -0,0 +1,208 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: tesisEnv
|
| 2 |
+
channels:
|
| 3 |
+
- bioconda
|
| 4 |
+
- conda-forge
|
| 5 |
+
- anaconda
|
| 6 |
+
- https://repo.anaconda.com/pkgs/main
|
| 7 |
+
- https://repo.anaconda.com/pkgs/r
|
| 8 |
+
dependencies:
|
| 9 |
+
- _libgcc_mutex=0.1=main
|
| 10 |
+
- _openmp_mutex=5.1=1_gnu
|
| 11 |
+
- asttokens=3.0.0=py310h06a4308_0
|
| 12 |
+
- blas=1.0=mkl
|
| 13 |
+
- bzip2=1.0.8=h5eee18b_6
|
| 14 |
+
- c-ares=1.19.1=h5eee18b_0
|
| 15 |
+
- ca-certificates=2025.2.25=h06a4308_0
|
| 16 |
+
- comm=0.2.2=pyhd8ed1ab_1
|
| 17 |
+
- cyrus-sasl=2.1.28=h52b45da_1
|
| 18 |
+
- debugpy=1.8.11=py310h6a678d5_0
|
| 19 |
+
- decorator=5.2.1=pyhd8ed1ab_0
|
| 20 |
+
- entrypoints=0.4=py310h06a4308_0
|
| 21 |
+
- exceptiongroup=1.2.2=pyhd8ed1ab_1
|
| 22 |
+
- expat=2.7.1=h6a678d5_0
|
| 23 |
+
- font-ttf-dejavu-sans-mono=2.37=hd3eb1b0_0
|
| 24 |
+
- font-ttf-inconsolata=2.001=hcb22688_0
|
| 25 |
+
- font-ttf-source-code-pro=2.030=hd3eb1b0_0
|
| 26 |
+
- font-ttf-ubuntu=0.83=h8b1ccd4_0
|
| 27 |
+
- fontconfig=2.14.1=h55d465d_3
|
| 28 |
+
- fonts-anaconda=1=h8fa9717_0
|
| 29 |
+
- freetype=2.13.3=h4a9f257_0
|
| 30 |
+
- icu=73.1=h6a678d5_0
|
| 31 |
+
- importlib-metadata=8.5.0=py310h06a4308_0
|
| 32 |
+
- intel-openmp=2023.1.0=hdb19cb5_46306
|
| 33 |
+
- ipykernel=6.29.5=py310h06a4308_1
|
| 34 |
+
- ipython=8.33.0=pyh907856f_0
|
| 35 |
+
- ipywidgets=8.1.5=py310h06a4308_0
|
| 36 |
+
- jedi=0.19.2=py310h06a4308_0
|
| 37 |
+
- jpeg=9e=h5eee18b_3
|
| 38 |
+
- jsonschema=4.23.0=py310h06a4308_0
|
| 39 |
+
- jsonschema-specifications=2023.7.1=py310h06a4308_0
|
| 40 |
+
- jupyter_client=7.3.4=py310h06a4308_0
|
| 41 |
+
- jupyter_core=5.7.2=py310h06a4308_0
|
| 42 |
+
- jupyterlab_widgets=3.0.13=py310h06a4308_0
|
| 43 |
+
- kaleido-core=0.2.1=h7c8854e_0
|
| 44 |
+
- krb5=1.20.1=h143b758_1
|
| 45 |
+
- ld_impl_linux-64=2.40=h12ee557_0
|
| 46 |
+
- libabseil=20250127.0=cxx17_h6a678d5_0
|
| 47 |
+
- libcups=2.4.2=h2d74bed_1
|
| 48 |
+
- libcurl=8.12.1=hc9e6f67_0
|
| 49 |
+
- libedit=3.1.20230828=h5eee18b_0
|
| 50 |
+
- libev=4.33=h7f8727e_1
|
| 51 |
+
- libffi=3.4.4=h6a678d5_1
|
| 52 |
+
- libgcc-ng=11.2.0=h1234567_1
|
| 53 |
+
- libglib=2.78.4=hdc74915_0
|
| 54 |
+
- libgomp=11.2.0=h1234567_1
|
| 55 |
+
- libiconv=1.16=h5eee18b_3
|
| 56 |
+
- libnghttp2=1.57.0=h2d74bed_0
|
| 57 |
+
- libpng=1.6.39=h5eee18b_0
|
| 58 |
+
- libpq=17.4=hdbd6064_0
|
| 59 |
+
- libprotobuf=5.29.3=hc99497a_0
|
| 60 |
+
- libsodium=1.0.18=h7b6447c_0
|
| 61 |
+
- libssh2=1.11.1=h251f7ec_0
|
| 62 |
+
- libstdcxx-ng=11.2.0=h1234567_1
|
| 63 |
+
- libuuid=1.41.5=h5eee18b_0
|
| 64 |
+
- libxcb=1.15=h7f8727e_0
|
| 65 |
+
- libxkbcommon=1.0.1=h097e994_2
|
| 66 |
+
- libxml2=2.13.7=hfdd30dd_0
|
| 67 |
+
- lz4-c=1.9.4=h6a678d5_1
|
| 68 |
+
- mathjax=2.7.5=h06a4308_0
|
| 69 |
+
- matplotlib-inline=0.1.7=pyhd8ed1ab_1
|
| 70 |
+
- mkl=2023.1.0=h213fc3f_46344
|
| 71 |
+
- mkl-service=2.4.0=py310h5eee18b_2
|
| 72 |
+
- mkl_fft=1.3.11=py310h5eee18b_0
|
| 73 |
+
- mkl_random=1.2.8=py310h1128e8f_0
|
| 74 |
+
- mysql=8.4.0=h721767e_2
|
| 75 |
+
- narwhals=1.31.0=py310h06a4308_1
|
| 76 |
+
- nbformat=5.10.4=py310h06a4308_0
|
| 77 |
+
- ncurses=6.4=h6a678d5_0
|
| 78 |
+
- nest-asyncio=1.6.0=py310h06a4308_0
|
| 79 |
+
- nspr=4.35=h6a678d5_0
|
| 80 |
+
- nss=3.89.1=h6a678d5_0
|
| 81 |
+
- numpy=1.26.4=py310h5f9d8c6_0
|
| 82 |
+
- numpy-base=1.26.4=py310hb5e798b_0
|
| 83 |
+
- openldap=2.6.4=h42fbc30_0
|
| 84 |
+
- openssl=3.0.16=h5eee18b_0
|
| 85 |
+
- packaging=24.2=py310h06a4308_0
|
| 86 |
+
- parso=0.8.4=py310h06a4308_0
|
| 87 |
+
- pcre2=10.42=hebb0a14_1
|
| 88 |
+
- pexpect=4.9.0=pyhd8ed1ab_1
|
| 89 |
+
- pickleshare=0.7.5=pyhd3eb1b0_1003
|
| 90 |
+
- pip=25.0=py310h06a4308_0
|
| 91 |
+
- platformdirs=4.3.6=pyhd8ed1ab_1
|
| 92 |
+
- plotly=6.0.1=py310he3bba80_0
|
| 93 |
+
- prompt-toolkit=3.0.50=pyha770c72_0
|
| 94 |
+
- psutil=5.9.1=py310h5764c6d_0
|
| 95 |
+
- ptyprocess=0.7.0=pyhd3eb1b0_2
|
| 96 |
+
- pure_eval=0.2.3=pyhd8ed1ab_1
|
| 97 |
+
- pyfaidx=0.8.1.3=pyhdfd78af_0
|
| 98 |
+
- pygments=2.19.1=py310h06a4308_0
|
| 99 |
+
- pyqt=6.7.1=py310h6a678d5_1
|
| 100 |
+
- pyqt6-sip=13.9.1=py310h5eee18b_1
|
| 101 |
+
- python=3.10.16=he870216_1
|
| 102 |
+
- python-dateutil=2.9.0.post0=py310h06a4308_1
|
| 103 |
+
- python-fastjsonschema=2.20.0=py310h06a4308_0
|
| 104 |
+
- python-kaleido=0.2.1=py310h06a4308_0
|
| 105 |
+
- python_abi=3.10=2_cp310
|
| 106 |
+
- pyvcf3=1.0.3=pyhdfd78af_0
|
| 107 |
+
- pyzmq=26.2.0=py310h6a678d5_0
|
| 108 |
+
- qtbase=6.7.3=hdaa5aa8_0
|
| 109 |
+
- qtdeclarative=6.7.3=h6a678d5_0
|
| 110 |
+
- qtsvg=6.7.3=he621ea3_0
|
| 111 |
+
- qttools=6.7.3=h80c7b02_0
|
| 112 |
+
- qtwebchannel=6.7.3=h6a678d5_0
|
| 113 |
+
- qtwebsockets=6.7.3=h6a678d5_0
|
| 114 |
+
- readline=8.2=h5eee18b_0
|
| 115 |
+
- referencing=0.30.2=py310h06a4308_0
|
| 116 |
+
- rpds-py=0.22.3=py310h4aa5aa6_0
|
| 117 |
+
- setuptools=75.8.0=py310h06a4308_0
|
| 118 |
+
- sip=6.10.0=py310h6a678d5_0
|
| 119 |
+
- six=1.17.0=py310h06a4308_0
|
| 120 |
+
- sqlite=3.45.3=h5eee18b_0
|
| 121 |
+
- stack_data=0.6.3=pyhd8ed1ab_1
|
| 122 |
+
- tbb=2021.8.0=hdb19cb5_0
|
| 123 |
+
- tk=8.6.14=h39e8969_0
|
| 124 |
+
- tomli=2.0.1=py310h06a4308_0
|
| 125 |
+
- tornado=6.1=py310h7f8727e_0
|
| 126 |
+
- traitlets=5.14.3=py310h06a4308_0
|
| 127 |
+
- typing_extensions=4.12.2=py310h06a4308_0
|
| 128 |
+
- wcwidth=0.2.13=pyhd8ed1ab_1
|
| 129 |
+
- wheel=0.45.1=py310h06a4308_0
|
| 130 |
+
- widgetsnbextension=4.0.13=py310h06a4308_0
|
| 131 |
+
- xcb-util-cursor=0.1.4=h5eee18b_0
|
| 132 |
+
- xz=5.6.4=h5eee18b_1
|
| 133 |
+
- zeromq=4.3.5=h6a678d5_0
|
| 134 |
+
- zipp=3.21.0=py310h06a4308_0
|
| 135 |
+
- zlib=1.2.13=h5eee18b_1
|
| 136 |
+
- pip:
|
| 137 |
+
- attrs==25.1.0
|
| 138 |
+
- biopython==1.85
|
| 139 |
+
- biotite==0.41.2
|
| 140 |
+
- brotli==1.1.0
|
| 141 |
+
- certifi==2025.1.31
|
| 142 |
+
- charset-normalizer==3.4.1
|
| 143 |
+
- cloudpathlib==0.20.0
|
| 144 |
+
- contourpy==1.3.1
|
| 145 |
+
- cycler==0.12.1
|
| 146 |
+
- dna-features-viewer==3.1.4
|
| 147 |
+
- einops==0.8.1
|
| 148 |
+
- esm==3.1.4
|
| 149 |
+
- executing==2.2.0
|
| 150 |
+
- filelock==3.17.0
|
| 151 |
+
- fonttools==4.56.0
|
| 152 |
+
- fsspec==2025.2.0
|
| 153 |
+
- graphql-core==3.2.6
|
| 154 |
+
- graphviz==0.20.3
|
| 155 |
+
- huggingface-hub==0.29.1
|
| 156 |
+
- idna==3.10
|
| 157 |
+
- jinja2==3.1.5
|
| 158 |
+
- joblib==1.4.2
|
| 159 |
+
- kiwisolver==1.4.8
|
| 160 |
+
- markupsafe==3.0.2
|
| 161 |
+
- matplotlib==3.10.1
|
| 162 |
+
- mpmath==1.3.0
|
| 163 |
+
- msgpack==1.1.0
|
| 164 |
+
- msgpack-numpy==0.4.8
|
| 165 |
+
- networkx==3.4.2
|
| 166 |
+
- nvidia-cublas-cu12==12.4.5.8
|
| 167 |
+
- nvidia-cuda-cupti-cu12==12.4.127
|
| 168 |
+
- nvidia-cuda-nvrtc-cu12==12.4.127
|
| 169 |
+
- nvidia-cuda-runtime-cu12==12.4.127
|
| 170 |
+
- nvidia-cudnn-cu12==9.1.0.70
|
| 171 |
+
- nvidia-cufft-cu12==11.2.1.3
|
| 172 |
+
- nvidia-curand-cu12==10.3.5.147
|
| 173 |
+
- nvidia-cusolver-cu12==11.6.1.9
|
| 174 |
+
- nvidia-cusparse-cu12==12.3.1.170
|
| 175 |
+
- nvidia-cusparselt-cu12==0.6.2
|
| 176 |
+
- nvidia-nccl-cu12==2.21.5
|
| 177 |
+
- nvidia-nvjitlink-cu12==12.4.127
|
| 178 |
+
- nvidia-nvtx-cu12==12.4.127
|
| 179 |
+
- pandas==2.2.3
|
| 180 |
+
- pillow==11.1.0
|
| 181 |
+
- protobuf==6.31.0
|
| 182 |
+
- py3dmol==2.4.2
|
| 183 |
+
- pyparsing==3.2.1
|
| 184 |
+
- pytz==2025.1
|
| 185 |
+
- pyyaml==6.0.2
|
| 186 |
+
- rcsb-api==1.1.3
|
| 187 |
+
- regex==2024.11.6
|
| 188 |
+
- requests==2.32.3
|
| 189 |
+
- rustworkx==0.16.0
|
| 190 |
+
- safetensors==0.5.3
|
| 191 |
+
- scikit-learn==1.6.1
|
| 192 |
+
- scipy==1.15.2
|
| 193 |
+
- sentencepiece==0.2.0
|
| 194 |
+
- sympy==1.13.1
|
| 195 |
+
- tenacity==9.0.0
|
| 196 |
+
- threadpoolctl==3.5.0
|
| 197 |
+
- tokenizers==0.20.3
|
| 198 |
+
- torch==2.6.0
|
| 199 |
+
- torchtext==0.18.0
|
| 200 |
+
- torchvision==0.21.0
|
| 201 |
+
- tqdm==4.67.1
|
| 202 |
+
- transformers==4.46.3
|
| 203 |
+
- triton==3.2.0
|
| 204 |
+
- tzdata==2025.1
|
| 205 |
+
- uniprot-id-mapper==1.1.4
|
| 206 |
+
- urllib3==2.3.0
|
| 207 |
+
- zstd==1.5.6.5
|
| 208 |
+
prefix: /home/jpuglia/miniconda3/envs/tesisEnv
|
Envs/requirements.txt
ADDED
|
@@ -0,0 +1,121 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
|
| 2 |
+
attrs @ file:///croot/attrs_1734533101012/work
|
| 3 |
+
biopython @ file:///home/builder/ci_310/biopython_1640788437968/work
|
| 4 |
+
biotite==0.41.2
|
| 5 |
+
Brotli==1.1.0
|
| 6 |
+
certifi==2025.1.31
|
| 7 |
+
charset-normalizer==3.4.1
|
| 8 |
+
cloudpathlib==0.20.0
|
| 9 |
+
comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1733502965406/work
|
| 10 |
+
contourpy==1.3.1
|
| 11 |
+
cycler==0.12.1
|
| 12 |
+
debugpy @ file:///croot/debugpy_1736267418885/work
|
| 13 |
+
decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
|
| 14 |
+
dna_features_viewer==3.1.4
|
| 15 |
+
einops==0.8.1
|
| 16 |
+
entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1733327148154/work
|
| 17 |
+
esm==3.1.4
|
| 18 |
+
exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1733208806608/work
|
| 19 |
+
executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1733569351617/work
|
| 20 |
+
fastjsonschema @ file:///croot/python-fastjsonschema_1731939362158/work
|
| 21 |
+
filelock==3.17.0
|
| 22 |
+
fonttools==4.56.0
|
| 23 |
+
fsspec==2025.2.0
|
| 24 |
+
graphviz==0.20.3
|
| 25 |
+
huggingface-hub==0.29.1
|
| 26 |
+
idna==3.10
|
| 27 |
+
importlib_metadata @ file:///croot/importlib_metadata-suite_1732633488278/work
|
| 28 |
+
ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
|
| 29 |
+
ipython @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_ipython_1740856895/work
|
| 30 |
+
ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1733493556527/work
|
| 31 |
+
jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
|
| 32 |
+
Jinja2==3.1.5
|
| 33 |
+
joblib==1.4.2
|
| 34 |
+
jsonschema @ file:///croot/jsonschema_1728486696720/work
|
| 35 |
+
jsonschema-specifications @ file:///croot/jsonschema-specifications_1699032386549/work
|
| 36 |
+
jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1654730843242/work
|
| 37 |
+
jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
|
| 38 |
+
jupyterlab_widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1733428046021/work
|
| 39 |
+
kaleido @ file:///home/conda/feedstock_root/build_artifacts/python-kaleido_1615204619408/work
|
| 40 |
+
kiwisolver==1.4.8
|
| 41 |
+
MarkupSafe==3.0.2
|
| 42 |
+
matplotlib==3.10.1
|
| 43 |
+
matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
|
| 44 |
+
mkl-service==2.4.0
|
| 45 |
+
mkl_fft @ file:///io/mkl313/mkl_fft_1730824109137/work
|
| 46 |
+
mkl_random @ file:///io/mkl313/mkl_random_1730823916628/work
|
| 47 |
+
mpmath==1.3.0
|
| 48 |
+
msgpack==1.1.0
|
| 49 |
+
msgpack-numpy==0.4.8
|
| 50 |
+
narwhals @ file:///croot/narwhals_1742845957875/work
|
| 51 |
+
nbformat @ file:///croot/nbformat_1728049424075/work
|
| 52 |
+
nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
|
| 53 |
+
networkx==3.4.2
|
| 54 |
+
numpy==1.26.4
|
| 55 |
+
nvidia-cublas-cu12==12.4.5.8
|
| 56 |
+
nvidia-cuda-cupti-cu12==12.4.127
|
| 57 |
+
nvidia-cuda-nvrtc-cu12==12.4.127
|
| 58 |
+
nvidia-cuda-runtime-cu12==12.4.127
|
| 59 |
+
nvidia-cudnn-cu12==9.1.0.70
|
| 60 |
+
nvidia-cufft-cu12==11.2.1.3
|
| 61 |
+
nvidia-curand-cu12==10.3.5.147
|
| 62 |
+
nvidia-cusolver-cu12==11.6.1.9
|
| 63 |
+
nvidia-cusparse-cu12==12.3.1.170
|
| 64 |
+
nvidia-cusparselt-cu12==0.6.2
|
| 65 |
+
nvidia-nccl-cu12==2.21.5
|
| 66 |
+
nvidia-nvjitlink-cu12==12.4.127
|
| 67 |
+
nvidia-nvtx-cu12==12.4.127
|
| 68 |
+
packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1733203243479/work
|
| 69 |
+
pandas==2.2.3
|
| 70 |
+
parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
|
| 71 |
+
pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
|
| 72 |
+
pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
|
| 73 |
+
pillow==11.1.0
|
| 74 |
+
platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1733232627818/work
|
| 75 |
+
plotly @ file:///home/conda/feedstock_root/build_artifacts/plotly_1742240435426/work
|
| 76 |
+
prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1737453357274/work
|
| 77 |
+
psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1653089181607/work
|
| 78 |
+
ptyprocess==0.7.0
|
| 79 |
+
pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
|
| 80 |
+
py3Dmol==2.4.2
|
| 81 |
+
pyfaidx @ file:///opt/conda/conda-bld/pyfaidx_1728570107633/work
|
| 82 |
+
Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1736243443484/work
|
| 83 |
+
pyparsing==3.2.1
|
| 84 |
+
PyQt6==6.7.1
|
| 85 |
+
PyQt6_sip @ file:///croot/pyqt-split_1744804475988/work/pyqt_sip
|
| 86 |
+
python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1733215673016/work
|
| 87 |
+
pytz==2025.1
|
| 88 |
+
PyVCF3 @ file:///opt/conda/conda-bld/pyvcf3_1650931562118/work
|
| 89 |
+
PyYAML==6.0.2
|
| 90 |
+
pyzmq @ file:///croot/pyzmq_1734687138743/work
|
| 91 |
+
referencing @ file:///croot/referencing_1699012038513/work
|
| 92 |
+
regex==2024.11.6
|
| 93 |
+
requests==2.32.3
|
| 94 |
+
rpds-py @ file:///croot/rpds-py_1736541261634/work
|
| 95 |
+
safetensors==0.5.3
|
| 96 |
+
scikit-learn==1.6.1
|
| 97 |
+
scipy==1.15.2
|
| 98 |
+
sip @ file:///croot/sip_1738856193618/work
|
| 99 |
+
six @ file:///home/conda/feedstock_root/build_artifacts/six_1733380938961/work
|
| 100 |
+
stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
|
| 101 |
+
sympy==1.13.1
|
| 102 |
+
tenacity==9.0.0
|
| 103 |
+
threadpoolctl==3.5.0
|
| 104 |
+
tokenizers==0.20.3
|
| 105 |
+
tomli @ file:///opt/conda/conda-bld/tomli_1657175507142/work
|
| 106 |
+
torch==2.6.0
|
| 107 |
+
torchtext==0.18.0
|
| 108 |
+
torchvision==0.21.0
|
| 109 |
+
tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827254365/work
|
| 110 |
+
tqdm==4.67.1
|
| 111 |
+
traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
|
| 112 |
+
transformers==4.46.3
|
| 113 |
+
triton==3.2.0
|
| 114 |
+
typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1733188668063/work
|
| 115 |
+
tzdata==2025.1
|
| 116 |
+
uniprot-id-mapper==1.1.4
|
| 117 |
+
urllib3==2.3.0
|
| 118 |
+
wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
|
| 119 |
+
widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1733128559935/work
|
| 120 |
+
zipp @ file:///croot/zipp_1732630741423/work
|
| 121 |
+
zstd==1.5.6.5
|
Plots/ConfusionMatrix/CM_ESM600m.png
ADDED
|
Plots/ConfusionMatrix/CM_RF_ESM300m.png
ADDED
|
Plots/ConfusionMatrix/CM_RF_ProstT5.png
ADDED
|
Plots/ConfusionMatrix/CM_SVM_ESM300m.png
ADDED
|
Plots/ConfusionMatrix/CM_SVM_ESM600m.png
ADDED
|
Plots/ConfusionMatrix/CM_SVM_ProstT5.png
ADDED
|
Plots/Embeddings/PCA_ESM300m.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/PCA_ESM600m.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/PCA_ProstT5.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/UMAP_ESM300m.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/UMAP_ESM600m.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/UMAP_ProstT5.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/t-SNE_ESM300m.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/t-SNE_ESM600m.png
ADDED
|
Git LFS Details
|
Plots/Embeddings/t-SNE_ProstT5.png
ADDED
|
Git LFS Details
|
Plots/ModelEvaluations/RFevaluacion.png
ADDED
|
Plots/ModelEvaluations/SVMevaluacion.png
ADDED
|
Plots/TaxDistributionPSORT.svg
ADDED
|
|
ProteinLocationPredictor/.gitattributes
ADDED
|
@@ -0,0 +1,35 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
+
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
+
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
+
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
+
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
+
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
+
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
+
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
+
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
+
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
+
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
+
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
+
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
+
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
+
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
+
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
+
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
+
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
+
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
+
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
+
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
+
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
+
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
+
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
+
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
+
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
+
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
+
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
+
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
+
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
+
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
+
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
+
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
+
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
ProteinLocationPredictor/README.md
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: mit
|
| 3 |
+
---
|
RepoStructure.txt
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
ProteinSubcellularLocPredictor/
|
| 2 |
+
│
|
| 3 |
+
├── Data/ # Raw and processed datasets
|
| 4 |
+
│ ├── raw/ # Raw, unaltered data
|
| 5 |
+
│ ├── processed/ # Cleaned or feature-engineered data
|
| 6 |
+
│ └── README.md # Explain data sources and formats
|
| 7 |
+
│
|
| 8 |
+
├── Notebooks/ # Jupyter notebooks for EDA, training, etc.
|
| 9 |
+
│ ├── 01_eda.ipynb
|
| 10 |
+
│ ├── 02_preprocessing.ipynb
|
| 11 |
+
│ ├── 03_training.ipynb
|
| 12 |
+
│ └── 04_evaluation.ipynb
|
| 13 |
+
│
|
| 14 |
+
├── Deployment/ # Code for using the trained model
|
| 15 |
+
│ ├── predictor.py # Main script to load model and predict
|
| 16 |
+
│ ├── api.py # Optional: REST API using Flask/FastAPI
|
| 17 |
+
│ └── cli.py # Optional: Command-line interface
|
| 18 |
+
│
|
| 19 |
+
├── src/ # Python modules shared between notebooks & deployment
|
| 20 |
+
│ ├── __init__.py
|
| 21 |
+
│ ├── preprocessing.py # Feature engineering, tokenization, etc.
|
| 22 |
+
│ ├── model.py # Model creation/training/loading
|
| 23 |
+
│ ├── utils.py # Helper functions
|
| 24 |
+
│ └── config.py # Paths, constants, and config values
|
| 25 |
+
│
|
| 26 |
+
├── .gitignore # Ignore datasets, checkpoints, virtual envs, etc.
|
| 27 |
+
├── requirements.txt # Python package dependencies
|
| 28 |
+
├── README.md # Project overview, setup, usage
|
| 29 |
+
└── LICENSE # Your preferred open-source license (e.g., MIT)
|
notebooks/EDA_Psort.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/ESMC_300m.ipynb
ADDED
|
@@ -0,0 +1,421 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "c409c4ad",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from esm.models.esmc import ESMC\n",
|
| 11 |
+
"from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
|
| 12 |
+
"from esm.sdk.forge import ESM3ForgeInferenceClient\n",
|
| 13 |
+
"import pandas as pd\n",
|
| 14 |
+
"import os\n",
|
| 15 |
+
"from concurrent.futures import ProcessPoolExecutor, as_completed\n",
|
| 16 |
+
"from tqdm import tqdm\n",
|
| 17 |
+
"import numpy as np\n",
|
| 18 |
+
"import os\n",
|
| 19 |
+
"import torch\n",
|
| 20 |
+
"import gc"
|
| 21 |
+
]
|
| 22 |
+
},
|
| 23 |
+
{
|
| 24 |
+
"cell_type": "code",
|
| 25 |
+
"execution_count": 4,
|
| 26 |
+
"id": "7f8f916c",
|
| 27 |
+
"metadata": {},
|
| 28 |
+
"outputs": [
|
| 29 |
+
{
|
| 30 |
+
"data": {
|
| 31 |
+
"text/html": [
|
| 32 |
+
"<div>\n",
|
| 33 |
+
"<style scoped>\n",
|
| 34 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 35 |
+
" vertical-align: middle;\n",
|
| 36 |
+
" }\n",
|
| 37 |
+
"\n",
|
| 38 |
+
" .dataframe tbody tr th {\n",
|
| 39 |
+
" vertical-align: top;\n",
|
| 40 |
+
" }\n",
|
| 41 |
+
"\n",
|
| 42 |
+
" .dataframe thead th {\n",
|
| 43 |
+
" text-align: right;\n",
|
| 44 |
+
" }\n",
|
| 45 |
+
"</style>\n",
|
| 46 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 47 |
+
" <thead>\n",
|
| 48 |
+
" <tr style=\"text-align: right;\">\n",
|
| 49 |
+
" <th></th>\n",
|
| 50 |
+
" <th>SwissProt_ID</th>\n",
|
| 51 |
+
" <th>Refseq_Accession</th>\n",
|
| 52 |
+
" <th>Other_Accession</th>\n",
|
| 53 |
+
" <th>GramStain</th>\n",
|
| 54 |
+
" <th>Experimental_Localization</th>\n",
|
| 55 |
+
" <th>Phylum</th>\n",
|
| 56 |
+
" <th>Class</th>\n",
|
| 57 |
+
" <th>Organism</th>\n",
|
| 58 |
+
" <th>sequence</th>\n",
|
| 59 |
+
" </tr>\n",
|
| 60 |
+
" </thead>\n",
|
| 61 |
+
" <tbody>\n",
|
| 62 |
+
" <tr>\n",
|
| 63 |
+
" <th>0</th>\n",
|
| 64 |
+
" <td>P50307</td>\n",
|
| 65 |
+
" <td>NaN</td>\n",
|
| 66 |
+
" <td>NaN</td>\n",
|
| 67 |
+
" <td>Gram positive</td>\n",
|
| 68 |
+
" <td>Cytoplasmic</td>\n",
|
| 69 |
+
" <td>Firmicutes</td>\n",
|
| 70 |
+
" <td>Bacilli</td>\n",
|
| 71 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 72 |
+
" <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
|
| 73 |
+
" </tr>\n",
|
| 74 |
+
" <tr>\n",
|
| 75 |
+
" <th>1</th>\n",
|
| 76 |
+
" <td>P01552</td>\n",
|
| 77 |
+
" <td>NaN</td>\n",
|
| 78 |
+
" <td>NaN</td>\n",
|
| 79 |
+
" <td>Gram positive</td>\n",
|
| 80 |
+
" <td>Extracellular</td>\n",
|
| 81 |
+
" <td>Firmicutes</td>\n",
|
| 82 |
+
" <td>Bacilli</td>\n",
|
| 83 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 84 |
+
" <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
|
| 85 |
+
" </tr>\n",
|
| 86 |
+
" <tr>\n",
|
| 87 |
+
" <th>2</th>\n",
|
| 88 |
+
" <td>P09978</td>\n",
|
| 89 |
+
" <td>NaN</td>\n",
|
| 90 |
+
" <td>NaN</td>\n",
|
| 91 |
+
" <td>Gram positive</td>\n",
|
| 92 |
+
" <td>Extracellular</td>\n",
|
| 93 |
+
" <td>Firmicutes</td>\n",
|
| 94 |
+
" <td>Bacilli</td>\n",
|
| 95 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 96 |
+
" <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
|
| 97 |
+
" </tr>\n",
|
| 98 |
+
" <tr>\n",
|
| 99 |
+
" <th>3</th>\n",
|
| 100 |
+
" <td>P45723</td>\n",
|
| 101 |
+
" <td>NaN</td>\n",
|
| 102 |
+
" <td>NaN</td>\n",
|
| 103 |
+
" <td>Gram positive</td>\n",
|
| 104 |
+
" <td>Extracellular</td>\n",
|
| 105 |
+
" <td>Firmicutes</td>\n",
|
| 106 |
+
" <td>Bacilli</td>\n",
|
| 107 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 108 |
+
" <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
|
| 109 |
+
" </tr>\n",
|
| 110 |
+
" <tr>\n",
|
| 111 |
+
" <th>4</th>\n",
|
| 112 |
+
" <td>P81177</td>\n",
|
| 113 |
+
" <td>NaN</td>\n",
|
| 114 |
+
" <td>NaN</td>\n",
|
| 115 |
+
" <td>Gram positive</td>\n",
|
| 116 |
+
" <td>Extracellular</td>\n",
|
| 117 |
+
" <td>Firmicutes</td>\n",
|
| 118 |
+
" <td>Bacilli</td>\n",
|
| 119 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 120 |
+
" <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
|
| 121 |
+
" </tr>\n",
|
| 122 |
+
" </tbody>\n",
|
| 123 |
+
"</table>\n",
|
| 124 |
+
"</div>"
|
| 125 |
+
],
|
| 126 |
+
"text/plain": [
|
| 127 |
+
" SwissProt_ID Refseq_Accession Other_Accession GramStain \\\n",
|
| 128 |
+
"0 P50307 NaN NaN Gram positive \n",
|
| 129 |
+
"1 P01552 NaN NaN Gram positive \n",
|
| 130 |
+
"2 P09978 NaN NaN Gram positive \n",
|
| 131 |
+
"3 P45723 NaN NaN Gram positive \n",
|
| 132 |
+
"4 P81177 NaN NaN Gram positive \n",
|
| 133 |
+
"\n",
|
| 134 |
+
" Experimental_Localization Phylum Class Organism \\\n",
|
| 135 |
+
"0 Cytoplasmic Firmicutes Bacilli Staphylococcus aureus \n",
|
| 136 |
+
"1 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 137 |
+
"2 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 138 |
+
"3 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 139 |
+
"4 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 140 |
+
"\n",
|
| 141 |
+
" sequence \n",
|
| 142 |
+
"0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
|
| 143 |
+
"1 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
|
| 144 |
+
"2 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
|
| 145 |
+
"3 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
|
| 146 |
+
"4 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
|
| 147 |
+
]
|
| 148 |
+
},
|
| 149 |
+
"execution_count": 4,
|
| 150 |
+
"metadata": {},
|
| 151 |
+
"output_type": "execute_result"
|
| 152 |
+
}
|
| 153 |
+
],
|
| 154 |
+
"source": [
|
| 155 |
+
"sequences: pd.DataFrame = pd.read_csv('../Data/trainingData.csv')\n",
|
| 156 |
+
"sequences.head()"
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
{
|
| 160 |
+
"cell_type": "code",
|
| 161 |
+
"execution_count": null,
|
| 162 |
+
"id": "07a49fd0",
|
| 163 |
+
"metadata": {},
|
| 164 |
+
"outputs": [],
|
| 165 |
+
"source": [
|
| 166 |
+
"client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_300m\").to(\"cuda\")"
|
| 167 |
+
]
|
| 168 |
+
},
|
| 169 |
+
{
|
| 170 |
+
"cell_type": "code",
|
| 171 |
+
"execution_count": null,
|
| 172 |
+
"id": "e562c770",
|
| 173 |
+
"metadata": {},
|
| 174 |
+
"outputs": [],
|
| 175 |
+
"source": [
|
| 176 |
+
"# Set up the output directory for the esmc_300m embeddings.\n",
|
| 177 |
+
"embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm300m/embeddings\")\n",
|
| 178 |
+
"os.makedirs(embeddings_dir, exist_ok=True)"
|
| 179 |
+
]
|
| 180 |
+
},
|
| 181 |
+
{
|
| 182 |
+
"cell_type": "code",
|
| 183 |
+
"execution_count": null,
|
| 184 |
+
"id": "294c6798",
|
| 185 |
+
"metadata": {},
|
| 186 |
+
"outputs": [],
|
| 187 |
+
"source": [
|
| 188 |
+
"# Embed one sequence with the loaded model and return its logits output (embeddings included).\n",
|
| 189 |
def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:
    """Encode one protein sequence and return the model's logits output.

    Args:
        client: Object exposing ``encode`` and ``logits``; this notebook
            passes the model returned by ``ESMC.from_pretrained``.
        sequence: Protein sequence as a plain string.

    Returns:
        The result of ``client.logits``, requested with ``sequence=True``
        and ``return_embeddings=True``.

    Raises:
        ESMProteinError: If ``client.encode`` hands back an error object
            instead of an encoded protein.
    """
    encoded = client.encode(ESMProtein(sequence=sequence))
    # encode() signals failure by returning an error instance, not by raising.
    if isinstance(encoded, ESMProteinError):
        raise encoded
    logits_config = LogitsConfig(sequence=True, return_embeddings=True)
    return client.logits(encoded, logits_config)
|
| 196 |
+
"\n",
|
| 197 |
+
"\n",
|
| 198 |
def save_emb(dir: str, df: pd.DataFrame, client: ESM3ForgeInferenceClient) -> None:
    """Embed every row of *df* and save each embedding as ``<identifier>.npy``.

    The identifier is the first non-missing value among ``SwissProt_ID``,
    ``Refseq_Accession`` and ``Other_Accession``; rows with none of them are
    written as ``unknown_<index label>.npy``.

    Args:
        dir: Output directory (``~`` is expanded; created if missing).
        df: Table with a ``sequence`` column and the three accession columns.
        client: Model/client forwarded to ``embed_sequence``.

    Returns:
        None. Rows that fail are reported with ``print`` and skipped, so one
        bad sequence does not abort the whole run.
    """
    dir = os.path.expanduser(dir)
    os.makedirs(dir, exist_ok=True)

    id_columns = ('SwissProt_ID', 'Refseq_Accession', 'Other_Accession')

    for i in tqdm(df.index, desc="Embedding sequences"):
        output = None
        try:
            output = embed_sequence(client=client, sequence=df.loc[i, 'sequence'])
            embeddings_np: np.ndarray = output.embeddings.cpu().numpy()

            # First available accession wins; fall back to the row label.
            identifier = next(
                (df.loc[i, col] for col in id_columns if not pd.isna(df.loc[i, col])),
                f"unknown_{i}",
            )

            np.save(os.path.join(dir, f"{identifier}.npy"), embeddings_np)

        except Exception as e:
            # Deliberate best effort: report the failing row and continue.
            print(f"Error embedding index {i}: {e}")

        finally:
            # Release memory on the failure path as well, so a failed
            # sequence does not leave cached GPU memory behind for the
            # rows that follow it.
            del output
            gc.collect()
            torch.cuda.empty_cache()
|
| 225 |
+
]
|
| 226 |
+
},
|
| 227 |
+
{
|
| 228 |
+
"cell_type": "code",
|
| 229 |
+
"execution_count": null,
|
| 230 |
+
"id": "80db4990",
|
| 231 |
+
"metadata": {},
|
| 232 |
+
"outputs": [],
|
| 233 |
+
"source": [
|
| 234 |
+
"\n",
|
| 235 |
+
" \n",
|
| 236 |
+
"# Embed every sequence and write one .npy file per protein into embeddings_dir.\n",
|
| 237 |
+
"save_emb(embeddings_dir, sequences,client = client)\n"
|
| 238 |
+
]
|
| 239 |
+
},
|
| 240 |
+
{
|
| 241 |
+
"cell_type": "code",
|
| 242 |
+
"execution_count": null,
|
| 243 |
+
"id": "77bf92c6",
|
| 244 |
+
"metadata": {},
|
| 245 |
+
"outputs": [],
|
| 246 |
+
"source": [
|
| 247 |
+
"sequences.loc[[11392]]"
|
| 248 |
+
]
|
| 249 |
+
},
|
| 250 |
+
{
|
| 251 |
+
"cell_type": "code",
|
| 252 |
+
"execution_count": 9,
|
| 253 |
+
"id": "365d9fdb",
|
| 254 |
+
"metadata": {},
|
| 255 |
+
"outputs": [],
|
| 256 |
+
"source": [
|
| 257 |
+
"sequences = sequences.drop(index=11392)"
|
| 258 |
+
]
|
| 259 |
+
},
|
| 260 |
+
{
|
| 261 |
+
"cell_type": "code",
|
| 262 |
+
"execution_count": null,
|
| 263 |
+
"id": "ad8a1990",
|
| 264 |
+
"metadata": {},
|
| 265 |
+
"outputs": [],
|
| 266 |
+
"source": [
|
| 267 |
+
"# Set up the output directory for the esmc_600m embeddings and load that model.\n",
|
| 268 |
+
"embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
|
| 269 |
+
"os.makedirs(embeddings_dir, exist_ok=True)\n",
|
| 270 |
+
"client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
|
| 271 |
+
]
|
| 272 |
+
},
|
| 273 |
+
{
|
| 274 |
+
"cell_type": "code",
|
| 275 |
+
"execution_count": null,
|
| 276 |
+
"id": "d42e5263",
|
| 277 |
+
"metadata": {},
|
| 278 |
+
"outputs": [],
|
| 279 |
+
"source": [
|
| 280 |
+
"save_emb(embeddings_dir, sequences,client = client)"
|
| 281 |
+
]
|
| 282 |
+
},
|
| 283 |
+
{
|
| 284 |
+
"cell_type": "code",
|
| 285 |
+
"execution_count": 2,
|
| 286 |
+
"id": "df91fc10",
|
| 287 |
+
"metadata": {},
|
| 288 |
+
"outputs": [],
|
| 289 |
+
"source": [
|
| 290 |
def load_single_embedding(row, id_col, path):
    """Load one saved embedding and average it over its first non-batch axis.

    Args:
        row: Record indexable by *id_col* (for example a DataFrame row).
        id_col: Key in *row* whose value names the ``.npy`` file.
        path: Directory that contains ``<identifier>.npy``.

    Returns:
        The mean of the stored array over axis 0 (after dropping a leading
        size-1 batch axis when the array is 3-D), or ``None`` when the file
        cannot be loaded or has an unexpected shape.
    """
    # Resolve the identifier once, outside the try block, so the error
    # message below cannot itself raise while looking it up again.
    identifier = row[id_col]
    try:
        emb = np.load(os.path.join(path, f"{identifier}.npy"))
        if emb.ndim == 3:
            # Presumably (1, length, dim) as written by save_emb -- TODO confirm.
            # squeeze raises ValueError if the leading axis is not size 1.
            emb = emb.squeeze(axis=0)
        elif emb.ndim != 2:
            raise ValueError(f"expected a 2-D or 3-D array, got shape {emb.shape}")
        return np.mean(emb, axis=0)
    except Exception as e:
        print(f"Error loading embedding {identifier} due to {e}")
        return None
|
| 299 |
+
"\n",
|
| 300 |
def load_emb_parallel(df: pd.DataFrame, id_col: str, path: str, max_workers=None,
                      keep_missing: bool = False) -> list:
    """Load the pooled embedding of every row of *df* in worker processes.

    Results are returned in the row order of *df*. Futures finish in an
    arbitrary order, so each one is stored at the position of the row it was
    submitted for instead of being appended as it completes.

    Args:
        df: Table whose *id_col* column names the ``.npy`` files.
        id_col: Column holding the identifier used as file name.
        path: Directory containing the ``.npy`` files.
        max_workers: Forwarded to ``ProcessPoolExecutor``.
        keep_missing: When True, rows whose embedding could not be loaded
            are kept as ``None`` so the list stays aligned one-to-one with
            *df*. When False (default), those rows are dropped.

    Returns:
        List of embeddings in row order (with ``None`` placeholders only if
        *keep_missing* is True).
    """
    results: list = [None] * len(df)
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future to the position of its row so that completion
        # order does not decide the output order.
        futures = {
            executor.submit(load_single_embedding, df.iloc[pos], id_col, path): pos
            for pos in range(len(df))
        }

        for future in tqdm(as_completed(futures), total=len(futures), desc="Loading embeddings"):
            results[futures[future]] = future.result()

    if keep_missing:
        return results
    return [emb for emb in results if emb is not None]
|
| 312 |
+
"\n"
|
| 313 |
+
]
|
| 314 |
+
},
|
| 315 |
+
{
|
| 316 |
+
"cell_type": "code",
|
| 317 |
+
"execution_count": 5,
|
| 318 |
+
"id": "329701f6",
|
| 319 |
+
"metadata": {},
|
| 320 |
+
"outputs": [],
|
| 321 |
+
"source": [
|
| 322 |
+
"sequences['Preferred_ID'] = sequences['SwissProt_ID'].fillna(sequences['Refseq_Accession']).fillna(sequences['Other_Accession'])\n"
|
| 323 |
+
]
|
| 324 |
+
},
|
| 325 |
+
{
|
| 326 |
+
"cell_type": "code",
|
| 327 |
+
"execution_count": 6,
|
| 328 |
+
"id": "9b720ff2",
|
| 329 |
+
"metadata": {},
|
| 330 |
+
"outputs": [
|
| 331 |
+
{
|
| 332 |
+
"name": "stderr",
|
| 333 |
+
"output_type": "stream",
|
| 334 |
+
"text": [
|
| 335 |
+
"Loading embeddings: 97%|█████████▋| 11377/11691 [05:32<00:10, 31.20it/s]"
|
| 336 |
+
]
|
| 337 |
+
},
|
| 338 |
+
{
|
| 339 |
+
"name": "stdout",
|
| 340 |
+
"output_type": "stream",
|
| 341 |
+
"text": [
|
| 342 |
+
"Error loading embedding Q9I120 due to [Errno 2] No such file or directory: '/home/jpuglia/Documentos/Tesis/datosGenerados/esm600m/embeddings/Q9I120.npy'\n"
|
| 343 |
+
]
|
| 344 |
+
},
|
| 345 |
+
{
|
| 346 |
+
"name": "stderr",
|
| 347 |
+
"output_type": "stream",
|
| 348 |
+
"text": [
|
| 349 |
+
"Loading embeddings: 100%|██████████| 11691/11691 [05:40<00:00, 34.29it/s]\n"
|
| 350 |
+
]
|
| 351 |
+
}
|
| 352 |
+
],
|
| 353 |
+
"source": [
|
| 354 |
+
"embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
|
| 355 |
+
"embeddings = load_emb_parallel(sequences, 'Preferred_ID',embeddings_dir)"
|
| 356 |
+
]
|
| 357 |
+
},
|
| 358 |
+
{
|
| 359 |
+
"cell_type": "code",
|
| 360 |
+
"execution_count": 15,
|
| 361 |
+
"id": "765209e3",
|
| 362 |
+
"metadata": {},
|
| 363 |
+
"outputs": [
|
| 364 |
+
{
|
| 365 |
+
"name": "stdout",
|
| 366 |
+
"output_type": "stream",
|
| 367 |
+
"text": [
|
| 368 |
+
"Embeddings count: 11690\n",
|
| 369 |
+
"Sequences count: 11690\n"
|
| 370 |
+
]
|
| 371 |
+
}
|
| 372 |
+
],
|
| 373 |
+
"source": [
|
| 374 |
+
"print(f\"Embeddings count: {len(embeddings)}\")\n",
|
| 375 |
+
"print(f\"Sequences count: {len(sequences)}\")\n"
|
| 376 |
+
]
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"cell_type": "code",
|
| 380 |
+
"execution_count": 17,
|
| 381 |
+
"id": "63bf7f6c",
|
| 382 |
+
"metadata": {},
|
| 383 |
+
"outputs": [
|
| 384 |
+
{
|
| 385 |
+
"data": {
|
| 386 |
+
"text/plain": [
|
| 387 |
+
"(1152,)"
|
| 388 |
+
]
|
| 389 |
+
},
|
| 390 |
+
"execution_count": 17,
|
| 391 |
+
"metadata": {},
|
| 392 |
+
"output_type": "execute_result"
|
| 393 |
+
}
|
| 394 |
+
],
|
| 395 |
+
"source": [
|
| 396 |
+
"embeddings[0].shape"
|
| 397 |
+
]
|
| 398 |
+
}
|
| 399 |
+
],
|
| 400 |
+
"metadata": {
|
| 401 |
+
"kernelspec": {
|
| 402 |
+
"display_name": "tesisEnv",
|
| 403 |
+
"language": "python",
|
| 404 |
+
"name": "python3"
|
| 405 |
+
},
|
| 406 |
+
"language_info": {
|
| 407 |
+
"codemirror_mode": {
|
| 408 |
+
"name": "ipython",
|
| 409 |
+
"version": 3
|
| 410 |
+
},
|
| 411 |
+
"file_extension": ".py",
|
| 412 |
+
"mimetype": "text/x-python",
|
| 413 |
+
"name": "python",
|
| 414 |
+
"nbconvert_exporter": "python",
|
| 415 |
+
"pygments_lexer": "ipython3",
|
| 416 |
+
"version": "3.10.16"
|
| 417 |
+
}
|
| 418 |
+
},
|
| 419 |
+
"nbformat": 4,
|
| 420 |
+
"nbformat_minor": 5
|
| 421 |
+
}
|
notebooks/ESMC_600m.ipynb
ADDED
|
@@ -0,0 +1,256 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 2,
|
| 6 |
+
"id": "c409c4ad",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"from esm.models.esmc import ESMC\n",
|
| 11 |
+
"from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
|
| 12 |
+
"from esm.sdk.forge import ESM3ForgeInferenceClient\n",
|
| 13 |
+
"from esm.sdk import batch_executor\n",
|
| 14 |
+
"import pandas as pd\n",
|
| 15 |
+
"import os\n",
|
| 16 |
+
"import csv\n",
|
| 17 |
+
"import numpy as np\n",
|
| 18 |
+
"import torch"
|
| 19 |
+
]
|
| 20 |
+
},
|
| 21 |
+
{
|
| 22 |
+
"cell_type": "code",
|
| 23 |
+
"execution_count": 3,
|
| 24 |
+
"id": "7f8f916c",
|
| 25 |
+
"metadata": {},
|
| 26 |
+
"outputs": [
|
| 27 |
+
{
|
| 28 |
+
"data": {
|
| 29 |
+
"text/html": [
|
| 30 |
+
"<div>\n",
|
| 31 |
+
"<style scoped>\n",
|
| 32 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 33 |
+
" vertical-align: middle;\n",
|
| 34 |
+
" }\n",
|
| 35 |
+
"\n",
|
| 36 |
+
" .dataframe tbody tr th {\n",
|
| 37 |
+
" vertical-align: top;\n",
|
| 38 |
+
" }\n",
|
| 39 |
+
"\n",
|
| 40 |
+
" .dataframe thead th {\n",
|
| 41 |
+
" text-align: right;\n",
|
| 42 |
+
" }\n",
|
| 43 |
+
"</style>\n",
|
| 44 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 45 |
+
" <thead>\n",
|
| 46 |
+
" <tr style=\"text-align: right;\">\n",
|
| 47 |
+
" <th></th>\n",
|
| 48 |
+
" <th>SwissProt_ID</th>\n",
|
| 49 |
+
" <th>Experimental_Localization</th>\n",
|
| 50 |
+
" <th>Organism</th>\n",
|
| 51 |
+
" <th>Phylum</th>\n",
|
| 52 |
+
" <th>Class</th>\n",
|
| 53 |
+
" <th>GramStain</th>\n",
|
| 54 |
+
" <th>Sequence</th>\n",
|
| 55 |
+
" </tr>\n",
|
| 56 |
+
" </thead>\n",
|
| 57 |
+
" <tbody>\n",
|
| 58 |
+
" <tr>\n",
|
| 59 |
+
" <th>0</th>\n",
|
| 60 |
+
" <td>P50307</td>\n",
|
| 61 |
+
" <td>Cytoplasmic</td>\n",
|
| 62 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 63 |
+
" <td>Firmicutes</td>\n",
|
| 64 |
+
" <td>Bacilli</td>\n",
|
| 65 |
+
" <td>1.0</td>\n",
|
| 66 |
+
" <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
|
| 67 |
+
" </tr>\n",
|
| 68 |
+
" <tr>\n",
|
| 69 |
+
" <th>1</th>\n",
|
| 70 |
+
" <td>P01552</td>\n",
|
| 71 |
+
" <td>Extracellular</td>\n",
|
| 72 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 73 |
+
" <td>Firmicutes</td>\n",
|
| 74 |
+
" <td>Bacilli</td>\n",
|
| 75 |
+
" <td>1.0</td>\n",
|
| 76 |
+
" <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
|
| 77 |
+
" </tr>\n",
|
| 78 |
+
" <tr>\n",
|
| 79 |
+
" <th>2</th>\n",
|
| 80 |
+
" <td>P09978</td>\n",
|
| 81 |
+
" <td>Extracellular</td>\n",
|
| 82 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 83 |
+
" <td>Firmicutes</td>\n",
|
| 84 |
+
" <td>Bacilli</td>\n",
|
| 85 |
+
" <td>1.0</td>\n",
|
| 86 |
+
" <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
|
| 87 |
+
" </tr>\n",
|
| 88 |
+
" <tr>\n",
|
| 89 |
+
" <th>3</th>\n",
|
| 90 |
+
" <td>P45723</td>\n",
|
| 91 |
+
" <td>Extracellular</td>\n",
|
| 92 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 93 |
+
" <td>Firmicutes</td>\n",
|
| 94 |
+
" <td>Bacilli</td>\n",
|
| 95 |
+
" <td>1.0</td>\n",
|
| 96 |
+
" <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
|
| 97 |
+
" </tr>\n",
|
| 98 |
+
" <tr>\n",
|
| 99 |
+
" <th>4</th>\n",
|
| 100 |
+
" <td>P81177</td>\n",
|
| 101 |
+
" <td>Extracellular</td>\n",
|
| 102 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 103 |
+
" <td>Firmicutes</td>\n",
|
| 104 |
+
" <td>Bacilli</td>\n",
|
| 105 |
+
" <td>1.0</td>\n",
|
| 106 |
+
" <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
|
| 107 |
+
" </tr>\n",
|
| 108 |
+
" </tbody>\n",
|
| 109 |
+
"</table>\n",
|
| 110 |
+
"</div>"
|
| 111 |
+
],
|
| 112 |
+
"text/plain": [
|
| 113 |
+
" SwissProt_ID Experimental_Localization Organism Phylum \\\n",
|
| 114 |
+
"0 P50307 Cytoplasmic Staphylococcus aureus Firmicutes \n",
|
| 115 |
+
"1 P01552 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 116 |
+
"2 P09978 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 117 |
+
"3 P45723 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 118 |
+
"4 P81177 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 119 |
+
"\n",
|
| 120 |
+
" Class GramStain Sequence \n",
|
| 121 |
+
"0 Bacilli 1.0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
|
| 122 |
+
"1 Bacilli 1.0 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
|
| 123 |
+
"2 Bacilli 1.0 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
|
| 124 |
+
"3 Bacilli 1.0 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
|
| 125 |
+
"4 Bacilli 1.0 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
|
| 126 |
+
]
|
| 127 |
+
},
|
| 128 |
+
"execution_count": 3,
|
| 129 |
+
"metadata": {},
|
| 130 |
+
"output_type": "execute_result"
|
| 131 |
+
}
|
| 132 |
+
],
|
| 133 |
+
"source": [
|
| 134 |
+
"sequences: pd.DataFrame = pd.read_csv('/home/jpuglia/Documentos/Tesis/tesisESM/Data/trainingData.csv')\n",
|
| 135 |
+
"sequences.head()"
|
| 136 |
+
]
|
| 137 |
+
},
|
| 138 |
+
{
|
| 139 |
+
"cell_type": "code",
|
| 140 |
+
"execution_count": null,
|
| 141 |
+
"id": "d7026979",
|
| 142 |
+
"metadata": {},
|
| 143 |
+
"outputs": [
|
| 144 |
+
{
|
| 145 |
+
"ename": "ValueError",
|
| 146 |
+
"evalue": "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",
|
| 147 |
+
"output_type": "error",
|
| 148 |
+
"traceback": [
|
| 149 |
+
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 150 |
+
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
| 151 |
+
"\u001b[0;32m/tmp/ipykernel_118460/767462261.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0misfloat\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Sequence'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0msequences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misfloat\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
| 152 |
+
"\u001b[0;32m~/miniconda3/envs/tesisEnv/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1575\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mfinal\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1576\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__nonzero__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mNoReturn\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1577\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1578\u001b[0m \u001b[0;34mf\"The truth value of a {type(self).__name__} is ambiguous. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1579\u001b[0m \u001b[0;34m\"Use a.empty, a.bool(), a.item(), a.any() or a.all().\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1580\u001b[0m )\n",
|
| 153 |
+
"\u001b[0;31mValueError\u001b[0m: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
|
| 154 |
+
]
|
| 155 |
+
}
|
| 156 |
+
],
|
| 157 |
+
"source": [
|
| 158 |
# Boolean mask (one entry per row) marking 'Sequence' values that are floats
# rather than strings -- presumably NaN from empty CSV fields; verify against
# the data. It is a pandas Series, not a single bool.
isfloat: pd.Series = sequences['Sequence'].apply(lambda x: isinstance(x, float))

# Keep only the rows whose sequence is not a float.
sequences = sequences[~isfloat]
|
| 161 |
+
]
|
| 162 |
+
},
|
| 163 |
+
{
|
| 164 |
+
"cell_type": "code",
|
| 165 |
+
"execution_count": null,
|
| 166 |
+
"id": "ea723ad9",
|
| 167 |
+
"metadata": {},
|
| 168 |
+
"outputs": [],
|
| 169 |
+
"source": [
|
| 170 |
+
"sequences = sequences.dropna()\n",
|
| 171 |
+
"sequences = sequences.drop_duplicates()\n",
|
| 172 |
+
"sequences.shape"
|
| 173 |
+
]
|
| 174 |
+
},
|
| 175 |
+
{
|
| 176 |
+
"cell_type": "code",
|
| 177 |
+
"execution_count": null,
|
| 178 |
+
"id": "07a49fd0",
|
| 179 |
+
"metadata": {},
|
| 180 |
+
"outputs": [],
|
| 181 |
+
"source": [
|
| 182 |
+
"torch.cuda.empty_cache()\n",
|
| 183 |
+
"client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
|
| 184 |
+
]
|
| 185 |
+
},
|
| 186 |
+
{
|
| 187 |
+
"cell_type": "code",
|
| 188 |
+
"execution_count": null,
|
| 189 |
+
"id": "294c6798",
|
| 190 |
+
"metadata": {},
|
| 191 |
+
"outputs": [],
|
| 192 |
+
"source": [
|
| 193 |
+
"# Set up the output directory for the embeddings.\n",
|
| 194 |
+
"embeddings_dir = \"/home/jpuglia/Documentos/Tesis/datosGenerados/esm600m/embeddings\"\n",
|
| 195 |
+
"os.makedirs(embeddings_dir, exist_ok=True)\n",
|
| 196 |
+
"\n",
|
| 197 |
def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:
    """Encode one protein sequence and return the model's logits output.

    Args:
        client: Object exposing ``encode`` and ``logits``; this notebook
            passes the model returned by ``ESMC.from_pretrained``.
        sequence: Protein sequence as a plain string.

    Returns:
        The result of ``client.logits``, requested with ``sequence=True``
        and ``return_embeddings=True``.

    Raises:
        ESMProteinError: If ``client.encode`` hands back an error object
            instead of an encoded protein.
    """
    encoded = client.encode(ESMProtein(sequence=sequence))
    # encode() signals failure by returning an error instance, not by raising.
    if isinstance(encoded, ESMProteinError):
        raise encoded
    logits_config = LogitsConfig(sequence=True, return_embeddings=True)
    return client.logits(encoded, logits_config)
|
| 205 |
+
"\n",
|
| 206 |
+
"\n",
|
| 207 |
def save_emb(dir: str, df: pd.DataFrame) -> None:
    """Embed each row's ``Sequence`` and save it as ``<SwissProt_ID>.npy``.

    Uses the module-level ``client`` defined in an earlier cell. There is no
    error handling: the first sequence that fails aborts the loop.

    Args:
        dir: Existing directory that receives the ``.npy`` files.
        df: Table with ``Sequence`` and ``SwissProt_ID`` columns.
    """
    for row_label in df.index:
        sequence = df.loc[row_label, 'Sequence']
        result: LogitsOutput = embed_sequence(client=client, sequence=sequence)

        target: str = os.path.join(dir, f"{df.loc[row_label, 'SwissProt_ID']}.npy")
        np.save(target, result.embeddings.cpu().numpy())

        # Drop the reference before asking CUDA to release its cached blocks.
        del result
        torch.cuda.empty_cache()
|
| 222 |
+
]
|
| 223 |
+
},
|
| 224 |
+
{
|
| 225 |
+
"cell_type": "code",
|
| 226 |
+
"execution_count": null,
|
| 227 |
+
"id": "80db4990",
|
| 228 |
+
"metadata": {},
|
| 229 |
+
"outputs": [],
|
| 230 |
+
"source": [
|
| 231 |
+
"save_emb(embeddings_dir, sequences)\n"
|
| 232 |
+
]
|
| 233 |
+
}
|
| 234 |
+
],
|
| 235 |
+
"metadata": {
|
| 236 |
+
"kernelspec": {
|
| 237 |
+
"display_name": "tesisEnv",
|
| 238 |
+
"language": "python",
|
| 239 |
+
"name": "python3"
|
| 240 |
+
},
|
| 241 |
+
"language_info": {
|
| 242 |
+
"codemirror_mode": {
|
| 243 |
+
"name": "ipython",
|
| 244 |
+
"version": 3
|
| 245 |
+
},
|
| 246 |
+
"file_extension": ".py",
|
| 247 |
+
"mimetype": "text/x-python",
|
| 248 |
+
"name": "python",
|
| 249 |
+
"nbconvert_exporter": "python",
|
| 250 |
+
"pygments_lexer": "ipython3",
|
| 251 |
+
"version": "3.10.16"
|
| 252 |
+
}
|
| 253 |
+
},
|
| 254 |
+
"nbformat": 4,
|
| 255 |
+
"nbformat_minor": 5
|
| 256 |
+
}
|
notebooks/EmbAnalisis.ipynb
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1e21abaa9bc06181ad40648ad354596985d284daada49adc7d9c0d17daa6bce5
|
| 3 |
+
size 10632399
|
notebooks/ProstT5.ipynb
ADDED
|
@@ -0,0 +1,526 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"cells": [
|
| 3 |
+
{
|
| 4 |
+
"cell_type": "code",
|
| 5 |
+
"execution_count": 1,
|
| 6 |
+
"id": "40b1e04a",
|
| 7 |
+
"metadata": {},
|
| 8 |
+
"outputs": [],
|
| 9 |
+
"source": [
|
| 10 |
+
"import pandas as pd\n",
|
| 11 |
+
"from transformers import T5Tokenizer, T5EncoderModel\n",
|
| 12 |
+
"import torch\n",
|
| 13 |
+
"import re\n",
|
| 14 |
+
"from tqdm.notebook import tqdm\n",
|
| 15 |
+
"import os\n",
|
| 16 |
+
"import numpy as np\n",
|
| 17 |
+
"import gc\n",
|
| 18 |
+
"\n",
|
| 19 |
+
"os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n"
|
| 20 |
+
]
|
| 21 |
+
},
|
| 22 |
+
{
|
| 23 |
+
"cell_type": "code",
|
| 24 |
+
"execution_count": 2,
|
| 25 |
+
"id": "f4c8ff50",
|
| 26 |
+
"metadata": {},
|
| 27 |
+
"outputs": [
|
| 28 |
+
{
|
| 29 |
+
"data": {
|
| 30 |
+
"text/html": [
|
| 31 |
+
"<div>\n",
|
| 32 |
+
"<style scoped>\n",
|
| 33 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 34 |
+
" vertical-align: middle;\n",
|
| 35 |
+
" }\n",
|
| 36 |
+
"\n",
|
| 37 |
+
" .dataframe tbody tr th {\n",
|
| 38 |
+
" vertical-align: top;\n",
|
| 39 |
+
" }\n",
|
| 40 |
+
"\n",
|
| 41 |
+
" .dataframe thead th {\n",
|
| 42 |
+
" text-align: right;\n",
|
| 43 |
+
" }\n",
|
| 44 |
+
"</style>\n",
|
| 45 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 46 |
+
" <thead>\n",
|
| 47 |
+
" <tr style=\"text-align: right;\">\n",
|
| 48 |
+
" <th></th>\n",
|
| 49 |
+
" <th>GramStain</th>\n",
|
| 50 |
+
" <th>Experimental_Localization</th>\n",
|
| 51 |
+
" <th>Phylum</th>\n",
|
| 52 |
+
" <th>Class</th>\n",
|
| 53 |
+
" <th>Organism</th>\n",
|
| 54 |
+
" <th>sequence</th>\n",
|
| 55 |
+
" <th>id</th>\n",
|
| 56 |
+
" </tr>\n",
|
| 57 |
+
" </thead>\n",
|
| 58 |
+
" <tbody>\n",
|
| 59 |
+
" <tr>\n",
|
| 60 |
+
" <th>0</th>\n",
|
| 61 |
+
" <td>Gram positive</td>\n",
|
| 62 |
+
" <td>Cytoplasmic</td>\n",
|
| 63 |
+
" <td>Firmicutes</td>\n",
|
| 64 |
+
" <td>Bacilli</td>\n",
|
| 65 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 66 |
+
" <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
|
| 67 |
+
" <td>P50307</td>\n",
|
| 68 |
+
" </tr>\n",
|
| 69 |
+
" <tr>\n",
|
| 70 |
+
" <th>1</th>\n",
|
| 71 |
+
" <td>Gram positive</td>\n",
|
| 72 |
+
" <td>Extracellular</td>\n",
|
| 73 |
+
" <td>Firmicutes</td>\n",
|
| 74 |
+
" <td>Bacilli</td>\n",
|
| 75 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 76 |
+
" <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
|
| 77 |
+
" <td>P01552</td>\n",
|
| 78 |
+
" </tr>\n",
|
| 79 |
+
" <tr>\n",
|
| 80 |
+
" <th>2</th>\n",
|
| 81 |
+
" <td>Gram positive</td>\n",
|
| 82 |
+
" <td>Extracellular</td>\n",
|
| 83 |
+
" <td>Firmicutes</td>\n",
|
| 84 |
+
" <td>Bacilli</td>\n",
|
| 85 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 86 |
+
" <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
|
| 87 |
+
" <td>P09978</td>\n",
|
| 88 |
+
" </tr>\n",
|
| 89 |
+
" <tr>\n",
|
| 90 |
+
" <th>3</th>\n",
|
| 91 |
+
" <td>Gram positive</td>\n",
|
| 92 |
+
" <td>Extracellular</td>\n",
|
| 93 |
+
" <td>Firmicutes</td>\n",
|
| 94 |
+
" <td>Bacilli</td>\n",
|
| 95 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 96 |
+
" <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
|
| 97 |
+
" <td>P45723</td>\n",
|
| 98 |
+
" </tr>\n",
|
| 99 |
+
" <tr>\n",
|
| 100 |
+
" <th>4</th>\n",
|
| 101 |
+
" <td>Gram positive</td>\n",
|
| 102 |
+
" <td>Extracellular</td>\n",
|
| 103 |
+
" <td>Firmicutes</td>\n",
|
| 104 |
+
" <td>Bacilli</td>\n",
|
| 105 |
+
" <td>Staphylococcus aureus</td>\n",
|
| 106 |
+
" <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
|
| 107 |
+
" <td>P81177</td>\n",
|
| 108 |
+
" </tr>\n",
|
| 109 |
+
" </tbody>\n",
|
| 110 |
+
"</table>\n",
|
| 111 |
+
"</div>"
|
| 112 |
+
],
|
| 113 |
+
"text/plain": [
|
| 114 |
+
" GramStain Experimental_Localization Phylum Class \\\n",
|
| 115 |
+
"0 Gram positive Cytoplasmic Firmicutes Bacilli \n",
|
| 116 |
+
"1 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 117 |
+
"2 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 118 |
+
"3 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 119 |
+
"4 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 120 |
+
"\n",
|
| 121 |
+
" Organism sequence \\\n",
|
| 122 |
+
"0 Staphylococcus aureus MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
|
| 123 |
+
"1 Staphylococcus aureus MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
|
| 124 |
+
"2 Staphylococcus aureus MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
|
| 125 |
+
"3 Staphylococcus aureus MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
|
| 126 |
+
"4 Staphylococcus aureus MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... \n",
|
| 127 |
+
"\n",
|
| 128 |
+
" id \n",
|
| 129 |
+
"0 P50307 \n",
|
| 130 |
+
"1 P01552 \n",
|
| 131 |
+
"2 P09978 \n",
|
| 132 |
+
"3 P45723 \n",
|
| 133 |
+
"4 P81177 "
|
| 134 |
+
]
|
| 135 |
+
},
|
| 136 |
+
"execution_count": 2,
|
| 137 |
+
"metadata": {},
|
| 138 |
+
"output_type": "execute_result"
|
| 139 |
+
}
|
| 140 |
+
],
|
| 141 |
+
"source": [
|
| 142 |
+
"raw_df = pd.read_csv('../Data/trainingData.csv')\n",
|
| 144 |
+
"fallback_id = raw_df['Refseq_Accession'].fillna(raw_df['Other_Accession'])  # RefSeq first, then any other accession\n",
|
| 145 |
+
"sequences_df = raw_df.assign(id=raw_df['SwissProt_ID'].fillna(fallback_id)).drop(columns=['SwissProt_ID', 'Refseq_Accession', 'Other_Accession'])\n",
|
| 146 |
+
"sequences_df.head()"
|
| 146 |
+
]
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"cell_type": "code",
|
| 150 |
+
"execution_count": 3,
|
| 151 |
+
"id": "6925775b",
|
| 152 |
+
"metadata": {},
|
| 153 |
+
"outputs": [
|
| 154 |
+
{
|
| 155 |
+
"name": "stdout",
|
| 156 |
+
"output_type": "stream",
|
| 157 |
+
"text": [
|
| 158 |
+
"Secuencias 11691\n",
|
| 159 |
+
"Ids 11691\n"
|
| 160 |
+
]
|
| 161 |
+
}
|
| 162 |
+
],
|
| 163 |
+
"source": [
|
| 164 |
+
"# Parallel lists: sequences[i] is the protein whose identifier is accession[i]\n",
|
| 165 |
+
"sequences, accession = (sequences_df[col].tolist() for col in ('sequence', 'id'))\n",
|
| 166 |
+
"\n",
|
| 167 |
+
"print(f\"Secuencias {len(sequences)}\\nIds {len(accession)}\")"
|
| 168 |
+
]
|
| 169 |
+
},
|
| 170 |
+
{
|
| 171 |
+
"cell_type": "code",
|
| 172 |
+
"execution_count": 4,
|
| 173 |
+
"id": "c19ac1ba",
|
| 174 |
+
"metadata": {},
|
| 175 |
+
"outputs": [],
|
| 176 |
+
"source": [
|
| 177 |
+
"path = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/prost/embeddings\")"
|
| 178 |
+
]
|
| 179 |
+
},
|
| 180 |
+
{
|
| 181 |
+
"cell_type": "code",
|
| 182 |
+
"execution_count": 5,
|
| 183 |
+
"id": "5b5e321e",
|
| 184 |
+
"metadata": {},
|
| 185 |
+
"outputs": [
|
| 186 |
+
{
|
| 187 |
+
"name": "stderr",
|
| 188 |
+
"output_type": "stream",
|
| 189 |
+
"text": [
|
| 190 |
+
"You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
|
| 191 |
+
]
|
| 192 |
+
},
|
| 193 |
+
{
|
| 194 |
+
"data": {
|
| 195 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 196 |
+
"model_id": "17d989ac426c445dbfd209d0247a9a3d",
|
| 197 |
+
"version_major": 2,
|
| 198 |
+
"version_minor": 0
|
| 199 |
+
},
|
| 200 |
+
"text/plain": [
|
| 201 |
+
"Processing Sequences: 0%| | 0/11691 [00:00<?, ?it/s]"
|
| 202 |
+
]
|
| 203 |
+
},
|
| 204 |
+
"metadata": {},
|
| 205 |
+
"output_type": "display_data"
|
| 206 |
+
},
|
| 207 |
+
{
|
| 208 |
+
"name": "stdout",
|
| 209 |
+
"output_type": "stream",
|
| 210 |
+
"text": [
|
| 211 |
+
"Error CUDA out of memory. Tried to allocate 1.64 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.02 GiB is free. Including non-PyTorch memory, this process has 4.11 GiB memory in use. Of the allocated memory 4.00 GiB is allocated by PyTorch, and 10.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba CAC14227\n",
|
| 212 |
+
"Error CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.09 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.89 GiB is allocated by PyTorch, and 36.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P12255\n",
|
| 213 |
+
"Error CUDA out of memory. Tried to allocate 982.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 748.44 MiB is free. Including non-PyTorch memory, this process has 4.40 GiB memory in use. Of the allocated memory 4.25 GiB is allocated by PyTorch, and 51.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P20471\n",
|
| 214 |
+
"Error CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 711.88 MiB is free. Including non-PyTorch memory, this process has 4.48 GiB memory in use. Of the allocated memory 4.33 GiB is allocated by PyTorch, and 44.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba A64556\n",
|
| 215 |
+
"Error CUDA out of memory. Tried to allocate 1.28 GiB. GPU 0 has a total capacity of 5.59 GiB of which 111.88 MiB is free. Including non-PyTorch memory, this process has 5.07 GiB memory in use. Of the allocated memory 4.90 GiB is allocated by PyTorch, and 67.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba AAF25576\n",
|
| 216 |
+
"Error CUDA out of memory. Tried to allocate 1.55 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.91 GiB is allocated by PyTorch, and 19.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q4L9P0\n",
|
| 217 |
+
"Error CUDA out of memory. Tried to allocate 1.04 GiB. GPU 0 has a total capacity of 5.59 GiB of which 591.88 MiB is free. Including non-PyTorch memory, this process has 4.60 GiB memory in use. Of the allocated memory 4.45 GiB is allocated by PyTorch, and 40.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I5N6\n",
|
| 218 |
+
"Error CUDA out of memory. Tried to allocate 1.49 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.22 GiB is free. Including non-PyTorch memory, this process has 3.95 GiB memory in use. Of the allocated memory 3.84 GiB is allocated by PyTorch, and 5.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I791\n",
|
| 219 |
+
"Error CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 31.88 MiB is free. Including non-PyTorch memory, this process has 5.14 GiB memory in use. Of the allocated memory 5.01 GiB is allocated by PyTorch, and 36.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I120\n"
|
| 220 |
+
]
|
| 221 |
+
}
|
| 222 |
+
],
|
| 223 |
+
"source": [
|
| 224 |
+
"# Setup device\n",
|
| 225 |
+
"device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
|
| 226 |
+
"os.makedirs(path, exist_ok=True)  # np.save does not create missing directories\n",
|
| 227 |
+
"# Load tokenizer and model\n",
|
| 228 |
+
"tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
|
| 229 |
+
"model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
|
| 230 |
+
"# device is a torch.device, so test its .type (it never equals the str 'cpu'); modules expose float(), not full()\n",
|
| 231 |
+
"model.float() if device.type == 'cpu' else model.half()\n",
|
| 232 |
+
"# Clean sequences: U/Z/O/B become X, residues are space-separated, then the <AA2fold> prefix is added\n",
|
| 233 |
+
"sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
|
| 234 |
+
"sequences = [\"<AA2fold> \" + s for s in sequences]\n",
|
| 235 |
+
"\n",
|
| 236 |
+
"# Process each sequence individually\n",
|
| 237 |
+
"for seq, acc_id in tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\"):\n",
|
| 238 |
+
"    try:\n",
|
| 239 |
+
"        # Tokenize\n",
|
| 240 |
+
"        ids = tokenizer(\n",
|
| 241 |
+
"            seq,\n",
|
| 242 |
+
"            add_special_tokens=True,\n",
|
| 243 |
+
"            return_tensors='pt'\n",
|
| 244 |
+
"        ).to(device)\n",
|
| 245 |
+
"\n",
|
| 246 |
+
"        # Forward pass\n",
|
| 247 |
+
"        with torch.no_grad():\n",
|
| 248 |
+
"            embedding_repr = model(\n",
|
| 249 |
+
"                ids.input_ids,\n",
|
| 250 |
+
"                attention_mask=ids.attention_mask\n",
|
| 251 |
+
"            )\n",
|
| 252 |
+
"\n",
|
| 253 |
+
"        # Exclusive slice end: position of the last attended token (presumably the EOS added by the tokenizer)\n",
|
| 254 |
+
"        real_len = ids.attention_mask[0].sum().item() - 1\n",
|
| 255 |
+
"\n",
|
| 256 |
+
"        # Extract the embeddings after the prefix token at index 0 and average them\n",
|
| 257 |
+
"        emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
|
| 258 |
+
"        emb_avg = emb.mean(dim=0).cpu().numpy()\n",
|
| 259 |
+
"\n",
|
| 260 |
+
"        # Save embedding using accession ID\n",
|
| 261 |
+
"        np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
|
| 262 |
+
"\n",
|
| 263 |
+
"    except RuntimeError as e:\n",
|
| 264 |
+
"        print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
|
| 265 |
+
"\n",
|
| 266 |
+
"    finally:\n",
|
| 267 |
+
"        # Cleanup on success and on failure, so tensors from a failed (e.g. OOM) pass do not stay referenced\n",
|
| 268 |
+
"        ids = embedding_repr = emb = emb_avg = None\n",
|
| 269 |
+
"        gc.collect()\n",
|
| 270 |
+
"        torch.cuda.empty_cache()\n",
|
| 271 |
+
"\n"
|
| 272 |
+
]
|
| 273 |
+
},
|
| 274 |
+
{
|
| 275 |
+
"cell_type": "code",
|
| 276 |
+
"execution_count": 6,
|
| 277 |
+
"id": "9ca9cb2d",
|
| 278 |
+
"metadata": {},
|
| 279 |
+
"outputs": [
|
| 280 |
+
{
|
| 281 |
+
"data": {
|
| 282 |
+
"text/html": [
|
| 283 |
+
"<div>\n",
|
| 284 |
+
"<style scoped>\n",
|
| 285 |
+
" .dataframe tbody tr th:only-of-type {\n",
|
| 286 |
+
" vertical-align: middle;\n",
|
| 287 |
+
" }\n",
|
| 288 |
+
"\n",
|
| 289 |
+
" .dataframe tbody tr th {\n",
|
| 290 |
+
" vertical-align: top;\n",
|
| 291 |
+
" }\n",
|
| 292 |
+
"\n",
|
| 293 |
+
" .dataframe thead th {\n",
|
| 294 |
+
" text-align: right;\n",
|
| 295 |
+
" }\n",
|
| 296 |
+
"</style>\n",
|
| 297 |
+
"<table border=\"1\" class=\"dataframe\">\n",
|
| 298 |
+
" <thead>\n",
|
| 299 |
+
" <tr style=\"text-align: right;\">\n",
|
| 300 |
+
" <th></th>\n",
|
| 301 |
+
" <th>GramStain</th>\n",
|
| 302 |
+
" <th>Experimental_Localization</th>\n",
|
| 303 |
+
" <th>Phylum</th>\n",
|
| 304 |
+
" <th>Class</th>\n",
|
| 305 |
+
" <th>Organism</th>\n",
|
| 306 |
+
" <th>sequence</th>\n",
|
| 307 |
+
" <th>id</th>\n",
|
| 308 |
+
" </tr>\n",
|
| 309 |
+
" </thead>\n",
|
| 310 |
+
" <tbody>\n",
|
| 311 |
+
" <tr>\n",
|
| 312 |
+
" <th>1532</th>\n",
|
| 313 |
+
" <td>Gram negative</td>\n",
|
| 314 |
+
" <td>OuterMembrane,Extracellular</td>\n",
|
| 315 |
+
" <td>Proteobacteria</td>\n",
|
| 316 |
+
" <td>Gammaproteobacteria</td>\n",
|
| 317 |
+
" <td>Yersinia pestis</td>\n",
|
| 318 |
+
" <td>MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ...</td>\n",
|
| 319 |
+
" <td>CAC14227</td>\n",
|
| 320 |
+
" </tr>\n",
|
| 321 |
+
" <tr>\n",
|
| 322 |
+
" <th>1683</th>\n",
|
| 323 |
+
" <td>Gram negative</td>\n",
|
| 324 |
+
" <td>OuterMembrane</td>\n",
|
| 325 |
+
" <td>Proteobacteria</td>\n",
|
| 326 |
+
" <td>Betaproteobacteria</td>\n",
|
| 327 |
+
" <td>Bordetella pertussis</td>\n",
|
| 328 |
+
" <td>MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS...</td>\n",
|
| 329 |
+
" <td>P12255</td>\n",
|
| 330 |
+
" </tr>\n",
|
| 331 |
+
" <tr>\n",
|
| 332 |
+
" <th>1767</th>\n",
|
| 333 |
+
" <td>Gram negative</td>\n",
|
| 334 |
+
" <td>CytoplasmicMembrane</td>\n",
|
| 335 |
+
" <td>Proteobacteria</td>\n",
|
| 336 |
+
" <td>Alphaproteobacteria</td>\n",
|
| 337 |
+
" <td>Sinorhizobium meliloti</td>\n",
|
| 338 |
+
" <td>MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG...</td>\n",
|
| 339 |
+
" <td>P20471</td>\n",
|
| 340 |
+
" </tr>\n",
|
| 341 |
+
" <tr>\n",
|
| 342 |
+
" <th>4089</th>\n",
|
| 343 |
+
" <td>Gram negative</td>\n",
|
| 344 |
+
" <td>OuterMembrane,Extracellular</td>\n",
|
| 345 |
+
" <td>Proteobacteria</td>\n",
|
| 346 |
+
" <td>Epsilonproteobacteria</td>\n",
|
| 347 |
+
" <td>Helicobacter pylori</td>\n",
|
| 348 |
+
" <td>MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW...</td>\n",
|
| 349 |
+
" <td>A64556</td>\n",
|
| 350 |
+
" </tr>\n",
|
| 351 |
+
" <tr>\n",
|
| 352 |
+
" <th>4623</th>\n",
|
| 353 |
+
" <td>Gram positive</td>\n",
|
| 354 |
+
" <td>Cellwall</td>\n",
|
| 355 |
+
" <td>Firmicutes</td>\n",
|
| 356 |
+
" <td>Bacilli</td>\n",
|
| 357 |
+
" <td>Lactobacillus reuteri</td>\n",
|
| 358 |
+
" <td>MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT...</td>\n",
|
| 359 |
+
" <td>AAF25576</td>\n",
|
| 360 |
+
" </tr>\n",
|
| 361 |
+
" </tbody>\n",
|
| 362 |
+
"</table>\n",
|
| 363 |
+
"</div>"
|
| 364 |
+
],
|
| 365 |
+
"text/plain": [
|
| 366 |
+
" GramStain Experimental_Localization Phylum \\\n",
|
| 367 |
+
"1532 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
|
| 368 |
+
"1683 Gram negative OuterMembrane Proteobacteria \n",
|
| 369 |
+
"1767 Gram negative CytoplasmicMembrane Proteobacteria \n",
|
| 370 |
+
"4089 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
|
| 371 |
+
"4623 Gram positive Cellwall Firmicutes \n",
|
| 372 |
+
"\n",
|
| 373 |
+
" Class Organism \\\n",
|
| 374 |
+
"1532 Gammaproteobacteria Yersinia pestis \n",
|
| 375 |
+
"1683 Betaproteobacteria Bordetella pertussis \n",
|
| 376 |
+
"1767 Alphaproteobacteria Sinorhizobium meliloti \n",
|
| 377 |
+
"4089 Epsilonproteobacteria Helicobacter pylori \n",
|
| 378 |
+
"4623 Bacilli Lactobacillus reuteri \n",
|
| 379 |
+
"\n",
|
| 380 |
+
" sequence id \n",
|
| 381 |
+
"1532 MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ... CAC14227 \n",
|
| 382 |
+
"1683 MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS... P12255 \n",
|
| 383 |
+
"1767 MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG... P20471 \n",
|
| 384 |
+
"4089 MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW... A64556 \n",
|
| 385 |
+
"4623 MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT... AAF25576 "
|
| 386 |
+
]
|
| 387 |
+
},
|
| 388 |
+
"execution_count": 6,
|
| 389 |
+
"metadata": {},
|
| 390 |
+
"output_type": "execute_result"
|
| 391 |
+
}
|
| 392 |
+
],
|
| 393 |
+
"source": [
|
| 394 |
+
"# Accessions reported as CUDA out-of-memory failures by the embedding loop; presumably re-run on CPU\n",
|
| 395 |
+
"cpu_ids = [\n",
|
| 396 |
+
"    'CAC14227',\n",
|
| 397 |
+
"    'P12255',\n",
|
| 398 |
+
"    'P20471',\n",
|
| 399 |
+
"    'A64556',\n",
|
| 400 |
+
"    'AAF25576',\n",
|
| 401 |
+
"    'Q4L9P0',\n",
|
| 402 |
+
"    'Q9I5N6',\n",
|
| 403 |
+
"    'Q9I791',\n",
|
| 404 |
+
"    'Q9I120',\n",
|
| 405 |
+
"]\n",
|
| 406 |
+
"cpu_sequences = sequences_df.loc[sequences_df['id'].isin(cpu_ids)]\n",
|
| 407 |
+
"cpu_sequences.head()\n"
|
| 408 |
+
]
|
| 409 |
+
},
|
| 410 |
+
{
|
| 411 |
+
"cell_type": "code",
|
| 412 |
+
"execution_count": 7,
|
| 413 |
+
"id": "a919beeb",
|
| 414 |
+
"metadata": {},
|
| 415 |
+
"outputs": [
|
| 416 |
+
{
|
| 417 |
+
"name": "stdout",
|
| 418 |
+
"output_type": "stream",
|
| 419 |
+
"text": [
|
| 420 |
+
"['MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQSMVSKLFRKNLLALSLGSIVFLSTGPVFAADITVSTQAELSAALSNGTYDKIILGADITLIGSLTVNMTSNQVVIDGQGKFGLTVNNTTNYGLVVSSGSGTLTLQNMSKIDSANYYSMVVLNGANTAVNVIYNNIDFLGSSQLIYMGAYGAATNSIMTFGDILNDVVVNDRAQEIGEVNKLAFTGRFHVTHTGSSVTSFVSTGGANNTSTMDFASGADVKIDRTGSTGDLTSTGVNAFAYTFADGASFELIANQNVFSGTTTNRGLEIGSYNSIDGFGSGVKIVLQSRSDGSIISGNGIDNATTNAAGINNNASGDANVIYNLGTGSILKATNTGILATKNANNASDIYIRSAGDITAATGISATHNGTGTVKIKNDGTITSTTAGIAISSASIKEISVDNTDGTITATAGTGVNVLASAILNLFGGTINTSATANGITFAGTEGGHTLTDLTINLLGTGIALSNVAGVNLTLSNVTLNTLNGTALNSLTGLTLVDSLNGRNTINIEGAGIGIAATNTELNTFDAEALDINVNGAGIGIQATGGGVNLSASNLIINVANTLGTALQITDGIDNTTTIGNEIQLNAENATAINFLGSSSKTLNNNGTIKGSVIFAGVADHIINNNGTLDGTLTTGAGNDTLVLDSSSQSNDVINLGDGNNSVTIQNGATVSSIITGNGNDTFTINGMSVGSTYLGSLDAGTGLNTXNXXASTDELAAATSLQGFTNINLVDSHITLVSDDNIGSGMVNIDSSSELLFGSTFDGILHATLGAGTGSAIVNNSANVSLEQASMFAGTWQVNQGGALTASNSNQLGSAKIGLDGTLNLDNIALFNHVLTGNGTLNVAKNLATTAFDFGSTVGGAFSGIVNLTKTTFALSADNAAALASATLKLSDDSVTTVGTTDRTLHGLDLSGGTLIFDGAVPQSQTSGVVTVTDLALNSGTVNITGSGSWDNTDPLATNVSILEQDRAGSTLELINATNVTGDIDALDLLVNGTAITSGTQGVQSAIQQGGSTVANAIHNYGLASSNSNGDSGLYVNYTLSALELLADGADALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGSNRYEGTTTVTAGELILGANGAFGQTSLLDIASGASANINGYSQTVGAVTNVGTVTLGSGGVLTSGLLTNGGILDLTGGALNLTXGGASTVAGGLTGAGTLNINGGNLSVSAANSGLSGQTHIADVASVTLTDTGTLGTSAVEVLGTLNLNGANAAMTNVLSGDGTINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHVLTGNGLLNVAKNDASTAFDFGSTVGGAFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGATDRTLHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLELINAANVTGNANNLDLLVDGTAITSGTQGVESAIQQGGSTVANAIHNYGLTSSNGNGGSGLYVNYTLSALELLANGANALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNSGAVTLGNGGVLTSGLLTNGGILDLTGGALNLAAGGSSTVAGGLTGAGTLNINGGDLAVSATNSGLSGQTHIADVASVTLTGTGTLGTSAVEVLGTLNLNGANAAMTNVLSGGGVINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSG
FLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNXTFGNSVTGSGVLQVTDDAEVTLTSSNXVGNTVKVDIADATLYVNDIALLDHVLTENGTLNVAKYLATTAFDXGSTVGXXFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGTTDRILHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLQLIDADNVTGNANDLELMINGTTISAGQGVQSTVQQGGYTVANATHNYGMTSNGGSGLYVNYTLSALELLADGANALLLATESGLTANRELNAELSGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNTGTVTLGNGGELTSTDTLINTGMINVTDGILNLENGGASSISGGLTGNGILNIKGGDFTISIDNNGLAGQTNISDGASVTLGNGGTIIGTGNLGSSVIDVLGDLNLVADNSLANVISGDGTINTTATVTLSGNSSFSGAHQIGTNGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGTGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHALTGNGLLNVAKNDASTAFDFGATVGGAFTGTVNLNNSTFDLSGNNTTVLAQATLKLSSGNLTSVGNGVQNIGTLAMNGGTLLFDNIVDNAGIITSDGTIAANSINTTGGGEVRVNLPSNLAPSLDGLSVMELDEGEIIVTLATGAATGTGHELTLTDENGDPISAVTYQGVHNAGSTSAAATGSFNYGMTTGEDYDGLYVNYGLTALELLSTGSEALVLTAILANNGTQSNDLSAQITGSGDLAFASANDGSTASLSNSTNSYTGTTWVSSGNLRLDADSALGQTSLLAMSTATHVDINGTQQVVGELATEGGSTLDLNDGKLTVTGGGQIDGALTGGGELVLSGGLLNVSYDNAGFTGSTDIANGAVAHLSQAQGLGNGTINNNGTLHLDNTIGTLFNALTGSDGEVLLSNNASVQLAGDNSGYSGLFTNQAGSILIANSAEHLGGSSIANSGALILDTGSVWELTNTISGTGTLVKRGSGTVKIEGDTVSAGLTTIEEGLLQLGSSAVTQTLSLEESLQERALLVSFASNMANLTSNVLITANGSLGGYGQVTGNVENYGNLIMPNALTGGDFGTFTIDGNYTGDEGMITFNTILAGDTSVTDRLVITGDTAGQSYVTVNNIGGVGARTFEGIKIIDVGGDSAGQFTLNGRAVGGAYEYFLYQGGASTPDDGNWYLRTEADDRRPEPASYTANLAAANNMFVTSLADRMGETLYTDVFTGEQKTTSLWLRNEGSHNRSRDDSGELKTQDNRYVMQLGGDVAQWSRNAQDLWRVGVMAGYANSSSSTVAQVAGYRSTGSVDGYSVGIYGSWLADNADDTGAYVDSWVQYSWFDNRVSGQDLATEKYDSKGFTASVEGGYAFKVGESVNQSYFIQPKAQVVWMGVKADDHTETNGTVISGDGNGNIQTRLGAKAFINPSDKAKVSGPAFKPFVEANWIHNTKDFGTTLDGVTVKQAGTANIAELKLGVDGQVNSQLNLWGNIGQQVGNKGYSETSVVLGVKYNF', 
'MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATSLSVAPNALAWALMLACTGLPLVTHAQGLVPQGQTQVLQGGNKVPVVNIADPNSGGVSHNKFQQFNVANPGVVFNNGLTDGVSRIGGALTKNPNLTRQASAILAEVTDTSPSRLAGTLEVYGKGADLIIANPNGISVNGLSTLNASNLTLTTGRPSVNGGRIGLDVQQGTVTIERGGVNATGLGYFDVVARLVKLQGAVSSKQGKPLADIAVVAGANRYDHATRRATPIAAGARGAAAGAYAIDGTAAGAMYGKHITLVSSDSGLGVRQLGSLSSPSAITVSSQGEIALGDATVQRGPLSLKGAGVVSAGKLASGGGAVNVAGGGAVKIASASSVGNLAVQGGGKVQATLLNAGGTLLVSGRQAVQLGAASSRQALSVNAGGALKADKLSATRRVDVDGKQAVALGSASSNALSVRAGGALKAGKLSATGRLDVDGKQAVTLGSVASDGALSVSAGGNLRAKQLVSSAQLEVRGQREVALDDASSARGMTVVAAGALAARNLQSKGAIGVQGGEAVSVANANSDAELRVRGRGQVDLHDLSAARGADISGEGRVNIGRARSDSDVKVSAHGALSIDSMTALGAIGVQAGGSVSAKDMRSRGAVTVSGGGAVNLGDVQSDGQVRATSAGAMTVRDVAAAADLALQAGDALQAGFLKSAGAMTVNGRDAVRLDGAHAGGQLRVSSDGQAALGSLAAKGELTVSAARAATVAELKSLDNISVTGGERVSVQSVNSASRVAISAHGALDVGKVSAKSGIGLEGWGAVGADSLGSDGAISVSGRDAVRVDQARSLADISLGAEGGATLGAVEAAGSIDVRGGSTVAANSLHANRDVRVSGKDAVRVTAATSGGGLHVSSGRQLDLGAVQARGALALDGGAGVALQSAKASGTLHVQGGEHLDLGTLAAVGAVDVNGTGDVRVAKLVSDAGADLQAGRSMTLGIVDTTGDLQARAQQKLELGSVKSDGGLQAAAGGALSLAAAEVAGALELSGQGVTVDRASASRARIDSTGSVGIGALKAGAVEAASPRRARRALRQDFFTPGSVVVRAQGNVTVGRGDPHQGVLAQGDIIMDAKGGTLLLRNDALTENGTVTISADSAVLEHSTIESKISQSVLAAKGDKGKPAVSVKVAKKLFLNGTLRAVNDNNETMSGRQIDVVDGRPQITDAVTGEARKDESVVSDAALVADGGPIVVEAGELVSHAGGIGNGRNKENGASVTVRTTGNLVNKGYISAGKQGVLEVGGALTNEFLVGSDGTQRIEAQRIENRGTFQSQAPAGTAGALVVKAAEAIVHDGVMATKGEMQIAGKGGGSPTVTAGAKATTSANKLSVDVASWDNAGSLDIKKGGAQVTVAGRYAEHGEVSIQGDYTVSADAIALAAQVTQRGGAANLTSRHDTRFSNKIRLMGPLQVNAGGAVSNTGNLKVREGVTVTAASFDNETGAEVMAKSATLTTSGAARNAGKMQVKEAATIVAASVSNPGTFTAGKDITVTSRGGFDNEGKMESNKDIVIKTEQFSNGRVLDAKHDLTVTASGQADNRGSLKAGHDFTVQAQRIDNSGTMAAGHDATLKAPHLRNTGQVVAGHDIHIINSAKLENTGRVDARNDIALDVADFTNTGSLYAEHDATLTLAQGTQRDLVVDQDHILPVAEGTLRVKAKSLTTEIETGNPGSLIAEVQENIDNKQAIVVGKDLTLSSAHGNVANEANALLWAAGELTVKAQNITNKRAALIEAGGNARLTAAVALLNKLGRIRAGEDMHLDAPRIENTAKLSGEVQRKGVQDVGGGEHGRWSGIGYVNYWLRAGNGKKAGTIAAPWYGGDLTAEQSLIEVGKDLYLNAGARKDEHRHLLNEGVIQAGGHGHIGGDVDNRSVVRTVSAMEYFKTPLPVSLTALDNRAGLSPATWNFQSTYELLDYLLDQNRYEYIWGLYPTYTEWSVNTLKNLDLGYQAKPAPTAPPMPKAPELD
LRGHTLESAEGRKIFGEYKKLQGEYEKAKMAVQAVEAYGEATRRVHDQLGQRYGKALGGMDAETKEVDGIIQEFAADLRTVYAKQADQATIDAETDKVAQRYKSQIDAVRLQAIQPGRVTLAKALSAALGADWRALGHSQLMQRWKDFKAGKRGAEIAFYPKEQTVLAAGAGLTLSNGAIHNGENAAQNRGRPEGLKIGAHSATSVSGSFDALRDVGLEKRLDIDDALAAVLVNPHIFTRIGAAQTSLADGAAGPALARQARQAPETDGMVDARGLGSADALASLASLDAAQGLEVSGRRNAQVADAGLAGPSAVAAPAVGAADVGVEPVTGDQVDQPVVAVGLEQPVATVRVAPPAVALPRPLFETRIKFIDQSKFYGSRYFFEQIGYKPDRAARVAGDNYFDTTLVREQVRRALGGYESRLPVRGVALVAKLMDSAGTVGKALGLKVGVAPTAQQLKQADRDFVWYVDTVIDGQKVLAPRLYLTEATRQGITDQYAGGGALIASGGDVTVNTDGHDVSSVNGLIQGRSVKVDAGKGKVVVADSKGAGGGIEADDEVDVSGRDIGIEGGKLRGKDVRLKADTVKVATSMRYDDKGRLAARGDGALDAQGGQLHIEAKRLETAGATLKGGKVKLDVDDVKLGGVYEAGSSYENKSSTPLGSLFAILSSTTETNQSAHANHYGTRIEAGTLEGKMQNLEIEGGSVDAAHTDLSVARDARFKAAADFAHAEHEKDVRQLSLGAKVGAGGYEAGFSLGSESGLEAHAGRGMTAGAEVKVGYRASHEQSSETEKSYRNANLNFGGGSVEAGNVLDIGGADINRNRYGGAAKGNAGTEEALRMRAKKVESTKYVSEQTSQSSGWSVEVASTASARSSLLTAATRLGDSVAQNVEDGREIRGELMAAQVAAEATQLVTADTAAVALSAGISADFDSSHSRSTSQNTQYLGGNLSIEATEGDATLVGAKFGGGDQVSLKAAKSVNLMAAESTFESYSESHNFHASADANLGANAVQGAVGLGLTAGMGTSHQITNETGKTYAGTSVDAANVSIDAGKDLNLSGSRVRGKHVVLDVEGDINATSKQDERNYNSSGGGWDASAGVAIQNRTLVAPVGSAGFNFNTEHDNSRLTNDGAAGVVASDGLTGHVKGDANLTGATIADLSGKGNLKVDGAVNAQNLKDYRDKDGGSGGLNVGISSTTLAPTVGVAFGRVAGEDYQAEQRATIDVGQTKDPARLQVGGGVKGTLNQDAAQATVVQRNKHWAGGGSEFSVAGKSLKKKNQVRPVETPTPDVVDGPPSRPTTPPASPQPIRATVEVSSPPPVSVATVEVVPRPKVETAQPLPPRPVAAQVVPVTPPKVEVAKVEVVPRPKVETAQPLPPRPVVAEKVTTPAVQPQLAKVETVQPVKPETTKPLPKPLPVAKVTKAPPPVVETAQPLPPVKPQKATPGPVAEVGKATVTTVQVQSAPPKPAPVAKQPAPAPKPKPKPKPKAERPKPGKTTPLSGRHVVQQQVQVLQRQASDINNTKSLPGGKLPKPVTVKLTDENGKPQTYTINRREDLMKLNGKVLSTKTTLGLEQTFRLRVEDIGGKNYRVFYETNK', 
'MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKGTSALPGFFPFEFRARHRENEKEILRVYRATAADVEAGASITPAAEWLLDNHHVVEEAIQEVRRDFPRRFYRQLPTLSVSGTVIPRTMALAWLYVAHTHSTVTRESITAMVEGFQEHETLKIGELWALPSILRFVLIENLRRIAIRVERSRGMRRKANEVADQLIRLNDPEGCRTLLVESEALAADNTFIAQLLYRMRDGSQSSGAVIAWIEERLERRGTDVEEALVAEQNRLSSGNATMSNIIRSLREIDDTDWAVWFESVSKIDATLREGSDYAALDFGSRNTYRDTIEKLARRSGHSEHEVTEIAIEMVEEAKAAAAVEAPLQEPNVGSFLVGKQRLALEKRIGYSPSIFQHLIRSVRKLDWFAIAGPNILLTILAMIVVYAFVSPMDIPSGAKLIMLLLFALPASEGAMGLFNTVFTLFAKPSRLVGYEFLDGIPEDARTLVVVPCLIAKRDHVDELVRNLEVHYLANPRGEIYFALLSDWADSKSEEAPADTDVLEYAKREIASLSARYAYDGKTRFFLLHRRRLYNEAEGVWMGWERKRGKLHELNLLLRGDRDTSFLQGANMVPEGVQYVMTLDSDTRLMRDAVTKLVGKLYHPINRPVVNPRTQEVVTGYSLLQPRVTPSLTTGSEASAFQRIFTINRGIDPYVFTVSDVYQDIAGEGSFTGKGLYHVDAFEAALKSRIEENAVLSHDLLEGSYARCALVTDIELVEDFPIRYEVEMSRQHRWARGDWQLLPYIFNPKNGLSMLGRWKMYDNLRRSLIPVAWLAASVMGWYYMEPTPALIWQLVLIFSLFVAPTLSLISGIMPRRNDIVARAHLHTVLSDIRAANAQVALRIVFIAHNAAMMADAIVRSLYRTFVSRKLMLEWRTAAQVQSAGHGSIGDYFRAMWTAPALALVSLALAAISDTGLPFIGLPFALIWAASPAVAWFVSQSAETEDQLVVSEEAIEEMRKIARRTWRYFEAFVTAEQNFLPPDNFQETPQPVLAERTSPTNIGVYLLSVMSARSFGWIGFEETITRLEQTIATIDRMPKYRGHLFNWYRTRGLEPMEPRYVSSVDSGNLAGHLIAVSSMCREWAEAPSAHVQGNLDGIGDVAAILKEALNELPDDRKTVRPLRRLVEERIAGFQNALAAVKRERELASIRVINLAVLARDMHKLTVNLDHEVRTVQSGEVATWAGSLVAACEAHIADGVFDLGAIEALRQRLLVLKERARDIAFSMDFSFLFRPERRLLSIGYRVNANELDEACYDLLASEARLTSLFAIAKGDLPTEHWYKLGRPIVPIGARGALVSWSGSMFEYLMPPLVMQERQGGILNQTNNLVVQEQINHGRRLGTPWGISEAAFNARDHELTYQYTNFGVPTLGLKRGLGQNAVIAPYASILACMYDPKSALANLARLREVGALGAYGYHDAVDFTPTRVPEGQKCAVVRNYYAHHHGMSVAAVANVVFNGQLREWFHADPVIEAAELLLQEKAPRDIPVMAAKREPEALGKGQADLLRPEVRVVEDPINQDRETVLLSNGHYSVMLTATGAGYARWNGQSVTRWTPDPVEDRTGTFIFLRDTVTGDWWSATAEPRRAPGEKTVTRFGDDKAEFVKTVGDLTSEVECIVATEHDAEGRRVILLNTGTEDRFIEVTSYAEPVLAMDDADSSHPTFSKMFLRTEISRHGDVIWVSRNKRSPGDPDIEVAHLVTDNAGSERHTQAETDRRRFLGQGRTLAEAAAFDPGATLSGTDGFTLDPIVSLRRVVRVPAGKKVSVIFWTIAAPDREGVDRAIDRYRHPETFNHELIHAWTRSQVQMRHVGITSKEAASFQMLGRYLVYPDMHLRADAETVKTGLASQSALWPLAISGDFPIFCLRINDDGDLGIAREALRAQEYLRARGITADLVVVNERASSYAQDLQHTLDSMCENLRLRGLSDGPRQHIFAVRRDLMEPETWSTLISASRAVFHARNG
TISDQIARATSLYSKSSEKKEEGAEMLLPVIREADARTAVELDGGDLDFWNGFGGFAEDGREYAVRLRGGEATPQPWINVISNEQFGFHVSAEGAAFSWSRNSRDYQLTPWTNDAVVNRPGEAIFVRDMASGAVLTPYAALSRRKSALFETRHGLGYSRFLSTQDELEIEAMHTVHRTLPAKLVRLTIRNRSSAARKLRVYGYAEWVLGNNRSRTAPFVLSEWDESAKTLVATNPYSIDYPGRCAFFASDGDIAGYTASRREFLGRAGGILAPQAVISGAELTGSTDVDGDACAALATDITVEAGVERQVTFFLGDADNPDQVRAVLEELRADSFGAALEAAKAFWGDFTGVVKVETPDRAFNHMINHWLPYQALGCRIMARSAFYQASGAFGFRDQLQDTLAFLIHRPALARAQILNAAARQFVEGDVQHWWLPGTDAGVRTMISDDVVWLAHAVAHYCAVTGEEDILKEKVPFITGPALEEGQHDSFYKPDVADEVGDVYEHCARALDLAIHRTGANGLPLILGGDWNDGMNRVGEAGEGTSVWLGWFLAGTLRAFLPYARARKDKPRVALWERHLEALKDALEQAGWDGDYYRRGYYDDDTPLGSAENGECRIDSIAQSWSTLSGEGDKERSLRAMDAVMAELVDPEKRIVRLFTPPLETTKQDPGYIKAYPPGVRENGGQYTHAATWVVLAFAAQERAEEAWRTFRMLNPVSHALSQVDAEHYRVEPYVVAADIYGEGALAGRGGWTWYTGSAGWLYRAGVEGILGIRKRGDKLLIRPVLPSEWPGYSAEVRVNGTTHRISVSRDSKSGEPVVSVNNSVTKNAHEGVLL', 'MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLWDLLNPKVGGEYVHWVKGSQYCAWWEFAGCLKNVWGANHKGYDAGNAANYLSSQNYQAISVGSGNETGTYSLSGFTNYVGGNLTINLGNSVVLDLSGSNSFTSYQGYNQGKDDVTFTVGAINLNGTLEVGNRVGSGAGTHTGTATLNLNANKVNINSNINAYKTSQVNIGNANSVITIGSVSLSGDVCSSLASVGIGANCSTSGPSYSFKGTTNATNTAFSNASGSFTFEENATFSGAKWNGGTYTFNKEFSATNNTAFSSGSFNFKGVSSFNGTSFSNASYTFDNQATFQNSSFNGGTFTFNNQTNPTNNAQHPQIQNSSFSGNATTLKGFVNFQQAFNNSNHQLTIQNASFNNATFNNTGKITIEKDASFNNTTFNTSVDTNNMSVTGGVTLSGKNDLKNGSTLDFGSSKITLAQGTTFNLTSLGSEKSVTILNSSGGITYSNLLNHAINGLTSALKTNESLSNPQSFAQGLWDIITYNGVTGQLLNENAATSKPTDSSPSKSSTNSTQVYQVGYKIGDTIYKLQETFSHNSIIIQALESGTYTPPPVINGSKFDLSASNYINADMPWYDHKYYIPKSQNFTESGTYYLPSVQIWGSYTNSFKQTFSANGSNLVIGYNSTWTDHNVSSSGTVSFGDTSGSALNGHCGPWPYYQCTGTTNGTYSAYHVYITANLRSGNRIGTGGAANLIFNGVDSINIANATITQHNAGIYSSSMTFSTQSMDNSQNLNGLNSNGKLSVYGTTFTNEAKDGKFIFNAGQAVFENTNFNGGSYQFSGDSLNFSNNNQFNSGSFEISAKNASFNNANFNNSASFNFNNSNATTSFVGDFTNANSNLQIAGNAVFGNSTNGSQNTANFNNTGSVNISGNATFDNVVFNGPTNTSVKGQVTLNNITLKNLNAPLSFGDGTITFNAHSVINIAESITNGNPITLVSSSKEIEYNNAFSKNLWQLINYQGHGASSEKLVSSAGNGVYDVVYSFNNQTYNFQEVFSQNSISIRRLGVNMVFDYVDMEKSDHLYYQNALGFMTYMPNSYNNNLGNANNTIYYYDKSIDFYASGKTLFTKAEFSQTFTGQNSAIVFGAKSIWTSLSDAPQSNTIIRFGDNKGAGSNDASGHCWNLQ
CIGFITGHYEAQKIYITGSIESGNRISSGGGASLNFNGLQGILLTNATLYNRAAGTQSSSMNFISNSANIQAQNSYFIDDTAQNGGNPNFSFNALNLDFSNSSFRGYVGKTQSVFKFNAKNAISFTNSTNLSSGLYQMQAKSVLFDNSNLSVSVGTSSIKANAINLSQNASINASNHSTLELQGDLNVNDTSSLNLNQSTINVSNNATINDYASLIASNGSHLNFNGAVNFNSANITTSLNNSSIVFKGAVSLGGQFNLSNNSSLDFQGSSAITSNTAFNFYDNAFSQSPITFHQALDIKAPLSLGGNLLNPNNSSVLDLKNSQLVFGDQGSLNIANIDLLSDLNDNKNRVYNIIQADMNSNWYERISFFGMHINDGIYDAKNQTYSFTNPLNNALKITESFKDNQLSVTLSQIPGIKNTLYNIGSEIFNYQKVYNNANGVYSYSDDAQGVFYLTSNVKGYYNPNQSYQASGSNNTTKNNNLTSESSIISQTYNAQGNPISALHIYNKGYNFNNIKALGQMALKLYPEIKKVLGNDFSPSSLNALNSNALNQLTKLITPNDWKNINELIDNANNSVVQNFNNGTLIVGATQIGQTDTNSAVVFGGLGYQTPCDYTDIVCQKFRGTYLGQLLESSSADLGYIDTTFNAKEIYLTGTLGSGNAWGTGGSASVTFNSQTSLILNQANIVSSQTDGIFSMLGQEGINKVFNQAGLANILGEVAVQSINKAGGLGNLIVNTLGSNSVIGGYLTPEQKNQTLSQLLGQNNFDNLMNDSGLNTAIKDLIRQKLGFWTGLVGGLAGLGGIDLQNPEKLIGSMSINDLLSKKGLFNQITGFISANDIGQVISVMLQDIVKPSNALKNDVAALGKQMIGEFLGQDTLNSLESLLQNQQIKSVLDKVLAAKGLGPIYEQGLGDLIPNLGKKGLFAPYGLSQVWQKGDFSFNAQGNVFVQNSTFSNANGGTLSFNAGNSLIFAGNNHIAFTNHAGTLQLLSDQVSNINITTLNASNGLKINAANNNVSVSQGNLFVSASCAQQSDPTTANIANPCALSAQSTNGASSNNASNNAPIALSNNDESLMVAANDFNFSGNIYANGVVDFSKIKGSANIKNLYLYNNAQFQANNLTISNQAVLEKNASFVTNNLNIQGAFNNNATQKIEVLQNLVIASNASLSTGIYGLEVGGALNNSGAIHFNLENTQTPTPLIQAEGIINLNTTQTPFMNVNNSMANNTTYTLLKSSRYIDYNINPNSLQSYLNLYTLININGNHIEEKNGALTYLGQRVLLQDKGLLLSVALPNSNNASQNNILSLSVLYNQVKMSCGDKAMDFTPPTLQDYIVGIQGQSALNQIEAVGGNAIKWLSTLMMETKENPFFAPIYLKNHSLNEILGVTKDLQNTASLISNPNFRDNATNLLELASYTQQTSRLTKLSDFRSREGESDFSLLELKNKRFSDPNPEVFVKYSQLSKHPNNLWVQGVGGASFISGGNGTLYGLNAGYDRLVKNVILGGYVAYGYSDFNGNIMHSLGNNVDVGMYARAFLKRNEFTLSANETYGGNATSINSSNSLLSVLNQRYNYNTWTTSVNGNYGYDFMFKQKSVVLKPQVGLSYHFIGLSGMKGNDAAYKQFLMHSNPSNESVLTLNMGLESRKYFGKNSYYFVTARLGRDLLIKSKGSNTVRFVGENTLLYRKGEVFNTFASVITGGEMHLWRLVYVNAGVGLKMGLQYQDINITGNVGMRVAF', 
'MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGATAQAATTESNASAKTEQVVQQNSTSAASDSTSTSNSSAAVSTSSATPVSTESASSMTVSDLPASASAASDNQASAANASESSSQSASSSVASDAAATVSKDSQAASEANSQSAADVETVQLPTSAANANANESQAANILGAQAVQKAANQQAPAGFTVTDPNYPAEMYKDPDASHYTYWWAQSSNGEYNLVLSTDRNGDGKVYVFLLGNNNNVLGKYTVDKNKSTEVATDDEGDFGTVYNDGQSGVFVTSDGTWKSKFNVFDPKAGEDDGDYGSISFMIPQVETQTTTYVTYFDSKGNKVDKPIEVSDPVIQKGLDGQIYTTKGGKVINGYFAKEPKNAHGFMSPFGKQGAIYTKDWHDGLKATFTETDTKTGLMHVVVKHYYHSWGWGTWRTVKEFDLAPGQSEKVDYDVYKSVTIHSIYIPQTINIQYTYEKLGNLVISSDSKSFPAEDKTQYPNDKSDSTKAGNVTIPKVAGFTPTINDKTVTNYTFNPSDYVSDLSKDINVVYVADTQEAAISFYDETDHKPLNDQTIQLTGKTGEKISHTEANQTLAKLGKQGYVVDQNTFADDATYDNDTQAPQEFTIYLKHDTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLADLVKQGYVLDKENTAKAFPADAVYDNNDQTPQEFTIYLKHGTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLAELEKQGYVLDENNTKLGFPSNAAYDDDDVKPQEFTIYLKHGMTHTDATDKNAEQKIVTETIHYVYENNQTAKTDYTSAVDFKRGYTTDNVTHKIISYDPWMVSSKKFGFVKSPAIEGYTPNHSQIDEITVTPDSKDVVKTVVYVGNAQEAQAIFYDETTGKEISGTREIATGKTDETISFTKDPNEVVKELEKQGYVFDKDNAKNNVFVAGTAYDKNSEVHQYFKYYLKHGHATVTPDQDPQKGQKTVTQTIKYEYADGTATGLADNVQTLTFKRTGDKDLVTHEVTWPDWSTVAGQQTSVVTSPALKGYTADTNEIPAITYHAGDSDVTYVVKYNADVQHAVINYIDGESDEILHTDKVNGHSDEKINYSTADMIKQLEAKGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTR
NGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKYEGDGYVLVSDGFKPGTKFGVGTPTYEVHFKHGMTHTDATDKNAEQKTVTETIHYVDENNQTVQPDSTTAVTFKRGYTTDNVTGKVVSYDPWTVDGNQADSKTFAAVPSPAVEGYTPNHQQINEFTVTPDSKDIVKTVVYVGDPQEAQAIFYDETTGKEISNTREIVNGKTDETIGFTKDPNEVVKELEKQGYVFDKDNANNNVFAAGTTYDKNSEVHQYFKYYFTHATTIVTPDNPKTPADVLPDNPGKNYPSGVAKDDLNKTVTRTINITTPDGKTQTITQKAEFTRSATVDEVTGEVTYGPWSKNVVLESVDVPNISGYVPSASVPEITVTPNDQDMTINITYKKLDSGKAADQGGNASNGGQATNGGSTTGQSAQNGQSGQTQNNAGAQQLPQTGNANNEKGALGLASAMFAAGLGLGFGSKKKCHED', 
'MSRKERNFKRFFGQEKARVKLYKSGKQWVKAGIREVQLLKVLGLPFLNKDVEQINNLDTNKDKNFKNQAMKATGLAGGAFTFAMLNDHHAYAASETPMTSEIASNSETVANQNSTTVTKSETSTTEYISSQTSTSQDATSSTNSTEKSTSSSTTDSQTSTDSTSDKSTSNSEKQDSSMSNSDTKASSSSTTDNSTSNNSTTSEKDTNSQANTTSTDSQKGSTSTNDNSITSTSTKDNQIRKNSTESNSITASNSTSDSNSGSTVSTNSTTSQLTSTSESQINTDLGSTLLVSDSTSTSTSTAPLKLRTFSRLATTTFAAAAATSTTNTYTGAGTDTNYNIPIYYKLTTVNNGTSMTFTYTVTYDNPATTTVERPTALSNSYAIYNTGTTNQTMFTLGSAYGTPSTATSYITDSTGAQVSNPRANTTNINKQGSGYTWANGYQMNGAQAKQGYGLTTTWTVPINSSGDTSFTFNPYSTSVTGGTNFFNGKKVTVTDPTSTANSQSASTSTANSQSASTSKSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSTSTSTSTSTANSQSTSTSTSTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSASTSTSESDSTSASTSLSESTSTSLSDSLSASTSLSDSASTSVSDSTSASTSLSGSESASLSDSASASTSLSESTSTSESTSTSESDSTSASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSDSASTSTSVSDSTSASTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSASTSTSVSDSTSASTSLSGSTSTSESDSTSMSTSLSGSESTSLSDSLSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSTSISTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISRSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSV
SDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTNTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSGSTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSSSESTSTSVSDSTSASVSTSISTSISMSESSSTSASTSDSTSTSASTSESRSASHSMSGTDSNNTSSSDSKSHSISNSDSNTTSDSASASTSISDSSSTSTSDSNASHSFSTSHSVSESNSMSTSHSQFDSISTSESMSGTDSTSLSTSLSHSASTSNSTSMTTSESQSNNDSQMHSNSLHHDAKDELPDTGDSDSNSTGLVSAVAAMLAGLGLFGKSRKNKKDKKNKGSEQ', 
'MPSRSPSSARSSRALYAPRLKPLAQAIALLLVAGGAQAAGQPFSAAWFAAKGAAQGGAAGAPRPGAQLPGAPPPLAQQQRVNQQLQRSMANLNNTVAAIAAQQAAQAAGRQAALNLPQDVPDGLGEGGLKVDASLPFEQAWQNAKGPVQTQAAGKTTVSIEQTADKAVLNWETFNVGRNTTVDFQQHADWALLNRVNDPSARPSQIQGQIKADGTVMLVNRNGVVFSGSSQVDVRNLTVAAANISDEQFRQRGLYYDNAGSRPTFTDAAGAVRVEQGAQLRTAAPSGSTRGGGYVLLLGSEVDNAGSIVTPKGQTVLSAGDSFVIRRGQGTDGNLTSTTRGNEVLPGFAADSSAGRVRNSGLVQAATGDISLSGREVEQAGVLLSSSSVDSRGTLHLKASERITLAEGATSAILVDSSGSAALDSQREALLKPLNGSSAAVSRGDDDRRDLSRVEIDSAGSVDFRDGSITLASGGQVAVNAGQRALLRDGAVIDVSGAQGVQVAMETNSIKVNVRGNEQRDASVNREGGGLNSQDVWVDVRDLVRVPAGTNGYASDRWYTAGGLLEVGGYLGTQGHSAGEWMAQGGIVSFTGNDVVTQAGSQINLSGGTLDVQGGYIRQTWLKGSDGRLYELGSAPGDLLYDGIYRGYEAHSERWDQTRYFYNPLIAPTQRYENGYSVGRDAGSLVIGSANARLDGQVVGDTYRGERQTEAPQAGLDGYNQSQNAVARGAQLVVGRYTPYYVKSSGLLEYALGADAGSLKQVVIGTGEVAAEEPTLDAPVAAERQGRLSLDSELLNGFQLGGLKVAAGESIRVDSALTLANGGEAILFANDVAIDADITAHGGSLQAGNVLAQISPNGTIDGFVDAGREAGILRVGDGVRLDASGLWSNLLLAPEDNDTLAYRDGGRISLRSGGDLSLGQGSLLDVSSGAALLADGKRLGGRGGDIALHASAGLAQASDGQLQLGGTLNGLGTSGAGTLSLQSGKVRIGGDDLGDGSLQLAEDFFQQGFASYRVVGRSGLTVAEDAQVRVARPVYRFASGAGEVAAGEAPREALEAWIPPLYLEDALAGRLVQREGADLYLQAGGDGNILGQLDPASQTLELGRGSLVEVDPGRAIVLRGPGQITLDGILNAWGGRIDVRQQQFGALDVTQDNQPKAQGQPHARSIWIGEQALLDVAGRAVTALDGRGRRYGEVQSGGSIVIGGEIDPGKAIATSADAFVIVRPGARLEASGSQAQLDVPGLGRVLLAGDGGRIALSSYNGLYLDGSLRAAAGGSGAAGGSLEIIADAPLYQGFTVVDDRVLAMRELILTAGHADSGLPTLLQPGMDDSALRYGQSRVGTQSLTGGGFDQLSLFSNGPLSFEGNIDLAMGRSLNLYAGTIAATGGGPSEVKLQAPYVRLSGIGMYGQQASGEFRPRLTYGPTATAEQVRLQVSAGRLLDIAGRLSFGSDGVINGVNAEAVRYQRPGFEKVTLRSEGDLRFAGDYPENGDPSGRLITHGDLQLTAAQLYPVTGASSTLYAGYGLDEGGQAVFDAERHLAIERSGESLPDTPLSVFGSLAFMASNIEQGGVVRAPLGLIQFGSNLDRAPGTVRLLPGSLTSVSGAELVMPYGGTTDGINYLVNQVPIQLTGAGGALAAGTLVAGVGLYASEVDVQQGARLDLSGGGELAGAGFISGRGGSTDARFHPLVQQDNDGFRLPELSSNPVYAIVPGHQAVSAPLGGEAGAIQPLVGQQVTIGDGVPGLAAGTYTLLPSTYALLPGAFRVEINGLAGQGAPMATQGLRNGSWATSGQLSIAGTSIRDSLSRQVILSSADTLRRYSQYNEMSYADFIRADAARKNIPRAMLPVDARSLYLGLRADEELRENALSFEGKVDFTPEESGYGGSLIVDAEAGIEILPEGGLPDSDFAGVSLVADDLNAIGASRIAIGTLPYVEYGEQGNFVQFGGSNRLFPVVLRKGAHLSAPEVIIGRDITLEGGSGISTLGKGKTAYDSSDGFIYQPG
GRNLLLLSNGWLNLLAPAADSSLPVRLGGCAEGAGCADTELYSEGTLGIATNGTVTFGDNVRYGTRNLSLALSTINIGSSQSLADAAARGVLPNGLALDQTVLQRLLRGERGAGIPALENLILSARDAVNIYGSVSLDTYDPATGKSSLANLVLGTPAIYGHGTGEDVASIRTASLVWSGSSQPAAAPVAGGAGSGSGTLRVDAERITLGYGANTQPAGETDEARLALGFAEVQLNASERISANHKGSLRVYQRLDGYVAGEGLRYSGGDLRLSTPLLTGEAGSLSRISSGGSLSLAAPAGAAAVTFDSGTAGLGAELSLSAREIRLDSAVSLPSGKLSLSAEDDLELGDGARIDLAGRKASFNDVDKYSWGGDLLLSSRAGDIRQAAGSLIDLSARNNRGGTLSAVALAEDAGVVDLQGRILGGASGDYDAGGTRVPFLGGELEIRAQRLGDGGSLSEQFTALNQRLNQGEVFGARRFQLKQGDLQIGDGLKAHRIEVSLDNGQLGVSGTVDASGAQVGEIRLAGGRGLSLGGNALLDAHGSLLRRDSYGQIIDSPNRAMVELSSGSGTLVLAGGARIDLRHGTAAPAEQVDGVARGTLELNAPRLGGVSAGDIAIDASGALDIRGAGSIALNAMQRYDDAPWGNDPAAGGRSYQVIDQAYLDARHAESSAFIAAALANRELLDGKLAGLTNATYADAFHLRPGVEIVSATADGDLVVQGDLDLSGYRYASLNPNTPLTEVYGSGEVGALVLRAGGDLNLYGSINDGFAPPPDSPDDKGWILTPGVQPFGGDLVVPGPGVVLGDGTAFLGGRTLNYDLPIKGTTLAAGTRLATEAVLEQPYTLAAGSVLVADIHDAAGTLLYAAGSLLRDGVTLEVGSRLGAGTLLAAPASVQAMTWPAGVPLPSILREGPSRPNVLLLNGELALARGSLIPSQTEVVLAGDAPFIELRPSDGVRQGRNWALAEMLPAGSQSWSMRLVAGADLAAADNRLVRPDSSASLNLADTHYQAKIEQSSGGLVFTDQATDWGITPGTPVDESNEWICGLGPYCAEPPRWTWAPGNYLGMPAGTAIGEGDLWWCSVDPSLCIENLGKTVVTPQNQLFSVLRTGTGDLDLASAGNLTQWSPYGVYTAGTQAADVATGFNQPRGLFNGSVLGAGGADYEVLSTSQYQAWYPEHGGNLDIAVGGDVVGDQWAEKLTSSDPIRPLPPSAAVGNWLWRQGSADREGVPTAWWVNFGSYVRGAEGDAPYLVGFTGFGTLGGGNLSMRTGGDAGNIAPRGDGSIPSSGNLNPRSQGLVLAVAGTGRLTSDGALQLGGGGDLNVRIGGEVNPSREARATQTYSSSGFDGLYSGGTIHDLQGALINLRGSASLYSGALGGIDPRYDTLLRDPAEVRSRDAFSPTLASSTGGLTLVAGDTGMRLETRGDLVLGGVTDPGRVGVPNTVGFTAPDGSVYQGGGIAWFSLWTAHTSIDLFAAGGNLTPSTQLVEATNAIPMAGRNLSPSDGRFIYPSIVRAAAPEGSIYLGPSSGDMGGVSLNVSTTPYSLLLAPSLNGELELLAGDSIYAGGYSVQRSGADPANLPSIWTPAFAGYSDAALLNPIAGNGSPDGNPAVIGGLPLFYFGPDSAASLARDLQPARFYALTGDIVGLNSGAQIRFGEQAGNRAGQTWYEGAGPVWMRAGRDIVASGTPLGQRISAPSQISTDASFTGNLFVHDDPNDLSLVQAGRDILYGNFNVAGPGTLEISAGRNILMEDRAAITSLGAVVPGDSRPGADIVLQAGAAGADYQAFLERYLDPANLAQAGTPLAEQPGKVVRTYESELAKWLNERFGFAGDAEQAQAFFAGLPAEQQRIFARQVYFAELRAGGREYNEVGGVRQGSYLRGRNAIAALFPERDPAGNPISYEGDIVMYGGAGVHTDFGGDIQLLSPGGRQVFGIEGEAPPSTAGIVTQGQGDIQAYSRDSILLGQSRIMTTFGGSILAWSAEGDINAGRGSQTTVVYTPPR
RIYDAWGNVSLSPQVPSTGAGIATLNPIPEVAPGDIDLIAPLGTIDAGEAGIRVSGNVNVAALQVVNAANIQTQGQSSGIPLVASVNTGALTSASAAASSATQAAEDVSRQQQAAARQRMPSVITVQVLGFGNERLEPSRDGASRSPGYNPDSAVQVLGAGALGEQARSQLTDEERGNLIL', 'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNDAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVRLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIVLEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDAGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVIGLNDLEVGAATLDNGQQGLLGSQQSTRVSAQALVNRGDGEVSGKRVEARVGSLDNRGGKLIGDDLLVVASGAIDNRLGLFSAANRLDLRARSLDNSGKGTLSSRGGLEVSLGGLLDNRDEGNLLSQGAQRVTVGQLDNRAGGLLSSRSELNVHGASLDNRGGVLVADAGLSATGGAFDNRDGGSASGKAGVRVEVASLRNDQGGKLLSDGRLDLAANAVGNAGGRIAAKGDLQATLGSLAQQGGELVSEKTLKVAADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAALDNQGGRLLSDAGVTLQGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLISETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGAANLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLVGERGVSLDLRGGDLDNQGGLISARGPLSIERLSVLDNRQGGEISSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNTGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQAGMSLRLGGGALRNQQDGLIFSQAGALDVQAGSLDNRQGTLQAQGDNRLRIGGALDNQGGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGGLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFT
GRGGDWSNDGLLASDGSFRLDLSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGNRAASLRNLSGVIESGKDFSLRASLIENRRAVLESKSGLYTAKMEQTACIEGVNAGDCSGKRNAIWTITQRDKTEVTASSAMGQLLAGGDFAIDGGTLNNLSSLIGSGGNLTANLEVLDNQGLETGELETIRVLRTARGGDIGGIDQKSRNFTNLYWYQSANFDPARAGEIPAALNAILSDWSFEYEFPSKGPTPISSGDQSYAAVIQAAGDVTVNASTRIDNGVTRPGYTFVGSGRQVGDSAVGGSGVSVVVPLTSQLPPDLARRQVNPVTLPGFSLPQGDNGLFRLSSRFAEDGNGSAALGAGADRTQGGSGVSVGQQGAGNAAGTWQGQGVRVDGLAGAANVQGQGGSTLGGSLPGVARVQGVPGNATPSASHKYLIETNPALTELKQFLNSDYLLSGLGMNPDDSKKRLGDGLYEQRLIRDAVVARTGQRYIDGLSSDEALFRYLMDNAIAYKDQLHLQLGVGLSAEQMAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLAPNGALIQGRDVKLVSGGDLHNVGTLRARNDLSATADNLDNSGLIEAGKRLDLLAGDSIRNRQGGVIAGRDVSLTALTGDVINERSVTRYDSALDGRTWERSFADSAARVEAANSLNVQAGRDIANLGGVLQSRGDLSLDAGRDVTVAAVEDRQGQTRWSTSRLQSVTQLGAEVSAGRDLNVSAGRDLTAVASTLEARRDIALSAGRDVTLAAAANEEHAYSKTRKVTYQEDKVAQQGTRVDAGGDLAINAGQDLRLIASQASAGDEAYLVAGDKLELLAANDSNYYLYDKKKKGDFGRKETRRDEVTDVKAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWNSAKGKGQTDETLRQTQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKEAEQRGDVDWRMVQEVHDSWKYSNSGMGPATQIAVAIAAAAIGGMAAAGALSGAGVGASSFAMGAGVGAAGSLSGTAAVSLINNKGDLGKVLKDSFSSDSLKQIAIASLTGGLTAEYFDGILQTKTDPLTGKVTVDLSSLSGVGRFAANQAMQNATSTVLSQALGQGGSLNEALKSALYNSFAAAGFNFVGDIGQEYSLKPGDPSMVTMHALMGGLAAQVSGGDFATGAAAAGANEALVAKLDQAFKSLSPENREAMVTMGSQLVGVLAAAVRDPDVTGKALESAAWVAKNSTQYNFLNHQDVADLDNALQKCKSQGNCRQVEEEFKARSDENRRRLNGCVAVGNCAEIRAEIDAGSTALNELVARQETANPGGSDSDIAYGFLMGRNVVDWTTAGQLHLEQTANLWWNGNPQWQKEVGAYLDQTGFNPFGIGVPAMGGAAGKVTAKALMNALKAGELPKGEVAPGKANLPTIGALADAEAGMPYTHPVKLAAKATGTAGKIKIEAGAIPDANEVRAGQGLSGLGYDVTHQTTASAKGIQGQRTADLHVDGLGSIDVYTPKNLDPTKIVRAIEKKSNQAGGVLVQADLPSTDMSSIAARMWGKTNAQSIKTIFFQKPDGSLVRFDRPAGGG', 
'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNGAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVKLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIALEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDGGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVVGLNELQVQAAALDNRSAGLLSSKGDMDIEFARLDNSAGGKLVSERRTLLKADRLDNRSGRIVAGQDLDLSSRLIDNRAGDISSTSRVVASAREQLDNRGGKIVGDSGLDITTPRMLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGVIDADLQGLDQHDRGNLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLVADEANSSKGRIAANGSLHAKVGTLSQKGGELTSQDSLTLDLGILNNNAGRIAGNQGVDITARQVDNSVGEIASQGVVALNLTEQLDNRGGKIVGDSGLGITAPHVLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGAIDAALQGLDQHDRGSLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLAADEANNSKGRIVAKDELRAKLGALVQNGGELTTQGALALDADKVDNGAGRIAGNRGVVIDARQVDNRAGEIASQGVATLNLTEQLDNRGGKVVADSGLGITAPRVLNQDKGVIASRDGLRLSGTELFNGNAGLLSSQRHIEVTLDGVLDNQGKGALLSDGTLTVSAGRIHNQDATLSSAGALRLSSQEAVDNRGGKLVTDSSLRLTSASLDNSRSGIISANAAAEIHTGVLNNSQKGNLGSNDGLGLIATEVDNSQEGRITAKGMIDANIKGLDQQGKGRLVSNAGIILDLNEGTLANGAQGLIATPGTLLLRQLGMVDNSGGEISSDRAFTLTTSALTNQGGRLRSGGVLTLRIAQALDNSLEGVLSGTGGLDIRALALDNRSGSIGSKGAVDIDVSRLENDDGDLLSEGRLKLTAERANSVRGRIAARGDLHASVTAFNQAGGELSSEGALMLEADSLDNRSGGLVSADGNLTVSARRIDNRAGEIASPGQVTLDVAEQLDNRGGKAIGDSGLRLAAPRVLNQDGGVLASRDGLRLNGAELFNGNGGLLSSQQSIDVILDGVLGNQAGSLSSQGRLSVKSGRLDNQGGAVSSAGTLSLSSQGALNNQGGRVVTDAGAVLRSASLDNSQGGIVSAKGAAEIRTGSLNNSQKGGIGSGAGLALVADLVDNSQNGRITAKGAIDANLKGLDQQGSG
RLVSDTAIALDLRGGELVNRAQGLIATPGALLLRQLGVVDNSGGGEISSDRSFTLAATALSNRGGRVISGDSLTLRIAQALDNSLQGVLSASGGLDVAALVFDNHSGIVASKGDTHIGVNRLENEAGRVVSEGALDLTAKQVSSAKGRIAAKGDLQVTVGTLEQQGGELASQGTLTLDADSLDNRNGGLVSADGGVTAEARQIDNRGGEISSVAKVALAVREQLDNRGGKVIGDSELSLTVQRLLNQAKGVLASRDGLHLDGAELLNGDGGLLSSQRLVDVTLSGALDNQGSGALVSEESLTVKADQVNNQAGTFSSAGSLLVTSRGELNNQGGRLVTDAGATLNSTGFDNSRAGLVSAKGAVAIRTGALNNSQKGSIGGNTGVTLVAGLVDNGREGRISTKGTLDANLKGLLQQGGGSLVGERGVTLDLNGGTLDNHDLGLVSTPGALLLRQLGMVDNSVGGEISSDRAFTLAANTLNNQGGRLISSEALTLRIAKTLDNSLKGQVLATDGLAIESQVLDNRAGTIGSKGDARISVTSLDNAEQGSLVSEGRLELVADQVSNGNQGRIAARGVLEAAVGTLLQQGGELVSQGSLDLRADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAVLDNQGGRLLSDAGVTLKGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLVSETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGADSLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLIGERGVSLDLRGGDLDNQGGLISARGPLSIERLNVLDNRQGGEIYSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNAGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQADMSLRLGGGALRNQQDGLIFSQAGALEVQAGSLDNRQGTLQAQGDNRLRIGGALDNQAGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGDLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFTGRGGDWSNDGLLASNGSLRLELSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGGWANRLENISGNLESTGDMRFSVSSLLNRRETLEIEGDLQNSAIGVRCTGCQLSERWGKTRSSSELVWIREYKSTLGDSSAAASITAGRDLLVVGASLQNIASNISAVRDATLSLSNFENKGYALGEYAVRGVYSPPSKFGEELLMRILAYNAVNDPSYG
EGYASTGGRLPNIHYFDKNFNEKVSPLEVIHGNGKNGGPGWHLYFGTLDVEYPDTDRWNKAIGRIPAPNYSSKKTDAIPDLLKGLAPLDELTINKGANSTVGAVVQAGGRVTVNAAESFNNSVLQGFQAVQETQLPHQDIAVSSTTSAVVTLKSQLPADLARQQINPLTLPGFSLPQGQNGLFRLASQGAQVNQASGALKSASDLTQSGHGVSVSAQTGSGASGWSTQARRVGDDRVTSLAGSAYQGRVAEAIDALRASAPISGDGGNTGRFQAGEHQATTGLGGLVEGNASGHSGNGVILADLRGGLPSFSSLPASDHVQGTVPGHDGNGTILANWQGAQATVQASPSTVRVEGVVSSPGGNGSILADLPAEQSSVQALPSAVRAQGSLPRLEERSALLAEPPVGQPALQTLPSVARVEGVPSNATPSNSHKYLIETNPALTELKQFLNSDYLLGGLGINPDDSKKRLGDGLYEQRLVREAIVQRTGQRFIAGLNSDEAMFRYLMDNAIASKDVLGLTPGVTLSAAQVAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLGPNGALIQGRDVNLITGGDLRNAGTLRAQNDLSATAGNIDNSGLIEAGNRLDLLASGSIRNDQGGIIAGREVSLSALTGDVINERTVTQHQSSYRGTGTTEAFADSAARIEAAQKLTVSAGRDVANIGGVIDSKGDLALQGGRDVLVSAAVAERGWTAGSQAYQTQTTQMGAEVVAGRDISVSAGRDISVVGSRIDARRDVTFEAGRDVGLVAAANEEHAYGKTKKVTFQDDKITQQATRVDAGGDLAINAGQDLRLVASQASAGDEAYLVAGDKLELLAANDSSYYLYDKKSKGSFGSKKTRRDEITDVTAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWQSSKGKGQTDETVRQSQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKQMEQRGDVDWRRVQELHDSWKYSNSGLGVGAQLAIAIVVAYFTAGAASAALGSMAGVGAGSGSMMAAAGSTAMVQAGTAVGTAAAGWANAAGTAVAMGMASNGAISTINNRGNLGDVVKDVTSSDALRGYVVAGTTAGLTAGVYDKWTSTQTGTSTALPNTGAVAPAAGLGTWQGVGQFTSNQLLQNGTSVLLDRALGGKGSLGDALQNSLANAFAAYGFKLIGDTTHGVLDDGSLGKIGLHALMGGLAAEAVGGDFRTGALAAGVNEALVDSLAKQYASLPIDDKKGLLIMSSQLIGVLAASTQGDADAKSLQTGAWVAGNATQHNYLSHWQEEKKRQEVDGCKDKQLCKTGIEAKWAIISAQQDVGIVVGVGGGIGLSTAETAVGVYELVKNWRETYAALEQLATSPEFRQQFGDNYLKGLEERAAFLTQAYEDAGWQGSVTAGVEGGRFAAELVGVLTAVKGGAQITAKLPTAAKNLVNAIAESPVSGSMSSQLGAVGDLGRLGGGGKGYVDILSHEAKQHILYGDKPGSGGHLWPGQAGKTVFPQNWSADKIVHEVGDIATSPSTKWYAQTGTGGVYTSKGDPAKWVAYEVRDGVRMRVVYQPATGKVITAFPDNAPIPPYKPIK']\n",
|
| 421 |
+
"['CAC14227', 'P12255', 'P20471', 'A64556', 'AAF25576', 'Q4L9P0', 'Q9I5N6', 'Q9I791', 'Q9I120']\n"
|
| 422 |
+
]
|
| 423 |
+
}
|
| 424 |
+
],
|
| 425 |
+
"source": [
|
| 426 |
+
"sequences = list(cpu_sequences['sequence'])\n",
|
| 427 |
+
"print(sequences)\n",
|
| 428 |
+
"accession = list(cpu_sequences['id'])\n",
|
| 429 |
+
"print(accession)"
|
| 430 |
+
]
|
| 431 |
+
},
|
| 432 |
+
{
|
| 433 |
+
"cell_type": "code",
|
| 434 |
+
"execution_count": 8,
|
| 435 |
+
"id": "2a1832cb",
|
| 436 |
+
"metadata": {},
|
| 437 |
+
"outputs": [
|
| 438 |
+
{
|
| 439 |
+
"data": {
|
| 440 |
+
"application/vnd.jupyter.widget-view+json": {
|
| 441 |
+
"model_id": "5df74f5eb4e24f72b645d0bbc1dc5c36",
|
| 442 |
+
"version_major": 2,
|
| 443 |
+
"version_minor": 0
|
| 444 |
+
},
|
| 445 |
+
"text/plain": [
|
| 446 |
+
"Processing Sequences: 0%| | 0/9 [00:00<?, ?it/s]"
|
| 447 |
+
]
|
| 448 |
+
},
|
| 449 |
+
"metadata": {},
|
| 450 |
+
"output_type": "display_data"
|
| 451 |
+
}
|
| 452 |
+
],
|
| 453 |
+
"source": [
|
| 454 |
+
"# Setup device\n",
|
| 455 |
+
"device = torch.device('cpu')\n",
|
| 456 |
+
"\n",
|
| 457 |
+
"# Load tokenizer and model\n",
|
| 458 |
+
"tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
|
| 459 |
+
"model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
|
| 460 |
+
"model.full() if device == 'cpu' else model.half()\n",
|
| 461 |
+
"\n",
|
| 462 |
+
"# Clean sequences\n",
|
| 463 |
+
"sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
|
| 464 |
+
"sequences = [ \"<AA2fold> \" + s for s in sequences]\n",
|
| 465 |
+
"\n",
|
| 466 |
+
"# Process each sequence individually\n",
|
| 467 |
+
"for i, (seq, acc_id) in enumerate(tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\")):\n",
|
| 468 |
+
" try:\n",
|
| 469 |
+
" # Tokenize\n",
|
| 470 |
+
" ids = tokenizer(\n",
|
| 471 |
+
" seq,\n",
|
| 472 |
+
" add_special_tokens=True,\n",
|
| 473 |
+
" return_tensors='pt'\n",
|
| 474 |
+
" ).to(device)\n",
|
| 475 |
+
"\n",
|
| 476 |
+
" # Forward pass\n",
|
| 477 |
+
" with torch.no_grad():\n",
|
| 478 |
+
" embedding_repr = model(\n",
|
| 479 |
+
" ids.input_ids,\n",
|
| 480 |
+
" attention_mask=ids.attention_mask\n",
|
| 481 |
+
" )\n",
|
| 482 |
+
"\n",
|
| 483 |
+
" # Compute actual length (excluding prefix)\n",
|
| 484 |
+
" real_len = ids.attention_mask[0].sum().item() - 1\n",
|
| 485 |
+
"\n",
|
| 486 |
+
" # Extract and average embeddings\n",
|
| 487 |
+
" emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
|
| 488 |
+
" emb_avg = emb.mean(dim=0).cpu().numpy()\n",
|
| 489 |
+
"\n",
|
| 490 |
+
" # Save embedding using accession ID\n",
|
| 491 |
+
" np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
|
| 492 |
+
"\n",
|
| 493 |
+
"\n",
|
| 494 |
+
" # Cleanup\n",
|
| 495 |
+
" del ids, embedding_repr, emb, emb_avg\n",
|
| 496 |
+
" torch.cuda.empty_cache()\n",
|
| 497 |
+
" gc.collect()\n",
|
| 498 |
+
"\n",
|
| 499 |
+
" except RuntimeError as e:\n",
|
| 500 |
+
" print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
|
| 501 |
+
"\n"
|
| 502 |
+
]
|
| 503 |
+
}
|
| 504 |
+
],
|
| 505 |
+
"metadata": {
|
| 506 |
+
"kernelspec": {
|
| 507 |
+
"display_name": "tesisEnv",
|
| 508 |
+
"language": "python",
|
| 509 |
+
"name": "python3"
|
| 510 |
+
},
|
| 511 |
+
"language_info": {
|
| 512 |
+
"codemirror_mode": {
|
| 513 |
+
"name": "ipython",
|
| 514 |
+
"version": 3
|
| 515 |
+
},
|
| 516 |
+
"file_extension": ".py",
|
| 517 |
+
"mimetype": "text/x-python",
|
| 518 |
+
"name": "python",
|
| 519 |
+
"nbconvert_exporter": "python",
|
| 520 |
+
"pygments_lexer": "ipython3",
|
| 521 |
+
"version": "3.10.16"
|
| 522 |
+
}
|
| 523 |
+
},
|
| 524 |
+
"nbformat": 4,
|
| 525 |
+
"nbformat_minor": 5
|
| 526 |
+
}
|
notebooks/__pycache__/my_utils.cpython-310.pyc
ADDED
|
Binary file (14.2 kB). View file
|
|
|
notebooks/hyperparamsRF.ipynb
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/my_utils.py
ADDED
|
@@ -0,0 +1,607 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#Libraries
|
| 2 |
+
import os
|
| 3 |
+
import re
|
| 4 |
+
from pprint import pprint
|
| 5 |
+
from io import StringIO
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
+
from urllib.error import HTTPError
|
| 8 |
+
from typing import Literal
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
import pandas as pd
|
| 12 |
+
import numpy as np
|
| 13 |
+
|
| 14 |
+
from sklearn.ensemble import RandomForestClassifier
|
| 15 |
+
from sklearn import svm
|
| 16 |
+
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
|
| 17 |
+
from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
|
| 18 |
+
from sklearn.decomposition import PCA
|
| 19 |
+
from sklearn.preprocessing import StandardScaler
|
| 20 |
+
from sklearn.pipeline import Pipeline
|
| 21 |
+
from sklearn.manifold import TSNE
|
| 22 |
+
from sklearn.model_selection import train_test_split
|
| 23 |
+
from sklearn.utils import resample
|
| 24 |
+
|
| 25 |
+
import umap
|
| 26 |
+
|
| 27 |
+
import requests
|
| 28 |
+
from Bio import Entrez
|
| 29 |
+
from Bio import SeqIO
|
| 30 |
+
from tqdm.notebook import tqdm
|
| 31 |
+
|
| 32 |
+
# Visualization libraries
|
| 33 |
+
import seaborn as sns
|
| 34 |
+
import matplotlib.pyplot as plt
|
| 35 |
+
import plotly.express as px
|
| 36 |
+
|
| 37 |
+
from esm.models.esmc import ESMC
|
| 38 |
+
from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
|
| 39 |
+
from transformers import T5Tokenizer, T5EncoderModel
|
| 40 |
+
|
| 41 |
+
import torch
|
| 42 |
+
import gc
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# Load one chunk of embeddings
|
| 47 |
+
def load_emb(path: str, acc: list[str])->list[np.array]:
    """Load one embedding per accession from ``<path>/<accession>.npy``.

    Each file is reduced according to its number of dimensions:

    * 3 dimensions: axis 0 is squeezed out (NumPy raises ``ValueError`` if
      that axis is not of length 1) and the result is averaged over axis 0.
    * 2 dimensions: averaged over axis 0.
    * anything else: kept exactly as stored.

    NOTE(review): presumably 3-D files are (1, tokens, dim) and 2-D files are
    (tokens, dim), so the mean is a per-sequence pooling -- confirm against
    the code that wrote the files.

    Args:
        path: Directory that contains the ``.npy`` files.
        acc: Accession identifiers; each one names a file in ``path``.

    Returns:
        The reduced arrays, in the same order as ``acc``.
    """
    def _pool(arr: np.ndarray) -> np.ndarray:
        # Collapse per-position embeddings into a single vector when needed.
        if arr.ndim == 3:
            arr = arr.squeeze(axis=0)
            return arr.mean(axis=0)
        if arr.ndim == 2:
            return arr.mean(axis=0)
        return arr

    pooled = []
    for name in tqdm(acc, desc = 'Cargando embeddings'):
        raw = np.load(os.path.join(path, f"{name}.npy"))
        pooled.append(_pool(raw))
    return pooled
|
| 61 |
+
|
| 62 |
+
def confusion(title : str, y_true: np.array, y_pred: np.array) -> None:
    """Draw a confusion-matrix heatmap normalised over the predicted labels.

    The matrix is computed with ``normalize='pred'``, so every column
    (predicted class) sums to 1 and each cell is the fraction of samples
    predicted as that column whose true class is the row.

    Args:
        title: Text appended to the figure title.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
    """
    # Build the label set from BOTH vectors and hand it to confusion_matrix
    # explicitly. Using only the labels seen in y_true for the ticks would
    # mislabel (or mis-size) the axes whenever the model predicts a class
    # that is absent from y_true.
    class_names = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )

    cm = confusion_matrix(y_true=y_true,
                          y_pred=y_pred,
                          labels=class_names,
                          normalize='pred')

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {title}')
    plt.tight_layout()
    plt.show()
|
| 78 |
+
|
| 79 |
+
def perplexity(X):
    """Plot the final t-SNE KL divergence for a sweep of perplexity values.

    A 2-component, PCA-initialised t-SNE is fitted for each perplexity in
    5, 10, ..., 50 and its ``kl_divergence_`` is plotted against the
    perplexity as a red line chart.

    Args:
        X: Sequence of embedding vectors (stacked row-wise with ``np.vstack``).

    Raises:
        ValueError: If there are too few samples for even the smallest
            perplexity in the sweep.
    """
    X_array = np.vstack(X)
    n_samples = X_array.shape[0]

    # scikit-learn rejects perplexity >= n_samples, so drop the candidates
    # that cannot be fitted instead of aborting the whole sweep.
    candidates = [p for p in range(5, 55, 5) if p < n_samples]
    if not candidates:
        raise ValueError(
            f"perplexity sweep needs more than 5 samples, got {n_samples}"
        )

    divergence = []
    for value in candidates:
        # Fixed seed so the curve is reproducible, as in tsne_plot.
        model = TSNE(n_components=2, init="pca", perplexity=value, random_state=42)
        # Only the fitted divergence is needed; the projection is discarded.
        model.fit(X_array)
        divergence.append(model.kl_divergence_)

    fig = px.line(x=candidates, y=divergence, markers=True)
    fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
    fig.update_traces(line_color="red", line_width=1)
    fig.show()
|
| 92 |
+
|
| 93 |
+
def plot_umap(X: list[np.array], y: list[str], title: str, org : list[str]) -> None:
    """Show an interactive 2-D UMAP scatter plot of standardised embeddings.

    Args:
        X: Embedding vectors, stacked row-wise before scaling.
        y: Label per sample; used for the point colour and in the hover box.
        title: Figure title.
        org: Extra per-sample value shown in the hover box
            (presumably the source organism -- confirm with the caller).
    """
    # Standardise every feature before fitting the UMAP projection.
    features = StandardScaler().fit_transform(np.vstack(X))

    projector = umap.UMAP(n_neighbors=30, random_state=42)
    coords = projector.fit_transform(features)
    first_axis, second_axis = coords[:, 0], coords[:, 1]

    fig = px.scatter(x=first_axis, y=second_axis, color=y, hover_data=[org, y])
    fig.update_layout(title=title, xaxis_title="First UMAP", yaxis_title="Second UMAP")
    fig.show()
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def plot_PCA(X: np.array, labels: list[str], title: str, org : list[str], scale: bool) -> None:
    """Show an interactive scatter plot of the first two principal components.

    Axis titles include the percentage of variance explained by each
    component.

    Args:
        X: Embedding vectors, stacked row-wise before the projection.
        labels: Label per sample; used for the point colour and hover box.
        title: Figure title.
        org: Extra per-sample value shown in the hover box.
        scale: When true, features are standardised before the PCA.
    """
    stacked = np.vstack(X)
    reducer = PCA(n_components=2, random_state=42)

    # With scaling, the PCA is the last step of a pipeline; it is the same
    # object as `reducer`, so its fitted attributes can be read directly.
    if scale:
        projector = Pipeline([('scaler', StandardScaler()), ('pca', reducer)])
    else:
        projector = reducer
    coords = projector.fit_transform(stacked)
    explained = reducer.explained_variance_ratio_

    df_plot = pd.DataFrame({'PC1': coords[:, 0], 'PC2': coords[:, 1], 'Label': labels})

    pc1_title = f'PC1 ({explained[0]*100:.1f}%)'
    pc2_title = f'PC2 ({explained[1]*100:.1f}%)'

    fig = px.scatter(df_plot, x='PC1', y='PC2', color='Label', hover_data=[org, labels])
    fig.update_layout(title=title, xaxis_title=pc1_title, yaxis_title=pc2_title)
    fig.show()
|
| 134 |
+
|
| 135 |
+
|
| 136 |
+
def tsne_plot(X, y, org : list[str], perplexity: float = 60) -> None:
    """Show an interactive 2-D t-SNE scatter plot of standardised embeddings.

    Args:
        X: Embedding vectors, stacked row-wise before scaling.
        y: Label per sample; used for the point colour and in the hover box.
        org: Extra per-sample value shown in the hover box.
        perplexity: t-SNE perplexity. Defaults to 60, the value that used to
            be hard-coded. scikit-learn requires it to be smaller than the
            number of samples, so pass a lower value for small datasets.
    """
    # Stack first, then scale, which is the same order plot_umap uses.
    X_array = StandardScaler().fit_transform(np.vstack(X))

    tsne = TSNE(n_components=2, perplexity=perplexity, random_state=42)
    tsne_fit = tsne.fit_transform(X_array)

    fig = px.scatter(x=tsne_fit[:, 0], y=tsne_fit[:, 1], color=y, hover_data=[org, y])
    fig.update_layout(
        title="t-SNE",
        xaxis_title="First t-SNE",
        yaxis_title="Second t-SNE",
    )

    fig.show()
|
| 149 |
+
|
| 150 |
+
|
| 151 |
+
def plot_emb(X, y, model_name, org : list[str]):
    """Render the PCA, t-SNE and UMAP views of one embedding set, in that order.

    Args:
        X: Embedding vectors.
        y: Label per sample.
        model_name: Name printed in the progress message; it is not used in
            the figure titles.
        org: Extra per-sample value forwarded to each plot's hover box.
    """
    print(f"Plotting embeddings for: {model_name}")

    # The PCA view is always drawn on standardised features.
    plot_PCA(X=X, labels=y, title="PCA", org=org, scale=True)
    tsne_plot(X=X, y=y, org=org)
    plot_umap(X=X, y=y, title="UMAP", org=org)
|
| 156 |
+
|
| 157 |
+
|
| 158 |
+
def evaluate(model, X_test, y_test):
    """Score a fitted classifier on held-out data and pretty-print the result.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out feature matrix.
        y_test: Ground-truth labels for ``X_test``.

    Returns:
        dict with keys ``'Accuracy'``, ``'Recall'``, ``'Precision'`` and
        ``'F1'``; the last three are support-weighted averages over classes.
    """
    y_pred = model.predict(X_test)

    # zero_division=0 yields the same score as the default ("warn") when a
    # class has no predicted or no true samples, but without emitting a
    # warning -- matching the classification_report calls in this module.
    result = {
        'Accuracy': accuracy_score(y_test, y_pred),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
    }

    pprint(result)

    return result
|
| 172 |
+
|
| 173 |
+
|
| 174 |
+
|
| 175 |
+
def train_rf(title : str, X : np.ndarray, y : np.ndarray, params: dict) -> tuple[RandomForestClassifier, dict]:
    """Fit a random forest on a stratified 67/33 split and report test metrics.

    Prints the metric dict from ``evaluate`` and the classification report,
    then shows the confusion-matrix plot.

    Args:
        title: Text used in the confusion-matrix title.
        X: Feature matrix.
        y: Labels; also used to stratify the split.
        params: Keyword arguments forwarded to ``RandomForestClassifier``.

    Returns:
        ``(fitted classifier, metrics dict returned by evaluate)``.
    """
    # Fixed seed so the train/test partition is the same on every call.
    split = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = split

    classifier = RandomForestClassifier(**params)
    classifier.fit(X_train, y_train)

    # Predictions on the held-out part feed the report and the plot below.
    y_pred = classifier.predict(X_test)

    evaluation = evaluate(classifier, X_test, y_test)
    report = classification_report(y_test, y_pred, zero_division=0)
    print(report)

    confusion(title=title, y_true=y_test, y_pred=y_pred)

    return classifier, evaluation
|
| 199 |
+
|
| 200 |
+
def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[Pipeline, dict]:
    """Fit a scaler + SVC pipeline on a stratified split and report test metrics.

    Args:
        title: Title forwarded to ``confusion``.
        X: Feature matrix.
        y: Labels; also used to stratify the split.
        params: Hyper-parameters. Only keys starting with ``'svm__'`` are
            used, and ``'svm__'`` is stripped out of each key before it is
            passed to ``svm.SVC``.

    Returns:
        The fitted pipeline and the metrics dict produced by ``evaluate``.
    """
    split = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = split

    svc_kwargs = {}
    for key, value in params.items():
        if key.startswith('svm__'):
            svc_kwargs[key.replace('svm__', '')] = value

    steps = [
        ('scaler', StandardScaler()),
        ('svm', svm.SVC(**svc_kwargs)),
    ]
    model = Pipeline(steps)
    model.fit(X_train, y_train)

    test_predictions = model.predict(X_test)
    metrics = evaluate(model=model, X_test=X_test, y_test=y_test)

    confusion(title=title, y_true=y_test, y_pred=test_predictions)
    print(classification_report(y_test, test_predictions, zero_division=0))

    return model, metrics
|
| 226 |
+
|
| 227 |
+
|
| 228 |
+
def randomSVM(X: np.ndarray, y: np.ndarray) -> dict:
    """Randomized hyper-parameter search for the scaler + SVC pipeline.

    The data is split 67/33 (stratified, seed 42); the search runs on a
    stratified resample of 3500 rows drawn from the training part only.

    Args:
        X: Feature matrix.
        y: Labels. This is a required argument: the previous signature used
            ``y = np.array``, which made the NumPy function itself the default
            value instead of annotating the parameter.

    Returns:
        ``best_params_`` of the search; keys carry the ``'svm__'`` prefix used
        in the search space.
    """
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.33,
                                              stratify=y,
                                              random_state=42)

    X_sample, y_sample = resample(X_train,
                                  y_train,
                                  n_samples=3500,
                                  stratify=y_train,
                                  random_state=42)

    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('svm', svm.SVC())])

    param_distributions = {
        'svm__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'svm__kernel': ['rbf'],
        'svm__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
        'svm__shrinking': [True, False],
        'svm__class_weight': ['balanced'],
        'svm__tol': [1e-5, 1e-4, 1e-3, 1e-2],
        'svm__max_iter': [-1, 1000, 5000, 10000],
        'svm__probability': [False, True],
        'svm__decision_function_shape': ['ovr', 'ovo'],
        'svm__cache_size': [200, 400, 600],
    }

    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=50,
        scoring='f1_weighted',
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1,
    )

    random_search.fit(X_sample, y_sample)

    pprint(random_search.best_params_)

    return random_search.best_params_
|
| 274 |
+
|
| 275 |
+
def randomSearch(X: np.ndarray, y: np.ndarray) -> dict:
    """Randomized hyper-parameter search for a random forest.

    The data is split 67/33 (stratified, seed 42); the search runs on a
    stratified resample of 3500 rows drawn from the training part only.

    Args:
        X: Feature matrix.
        y: Labels; also used to stratify the split and the resample.

    Returns:
        ``best_params_`` of the search.
    """
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
    classifier: RandomForestClassifier = RandomForestClassifier(random_state=42)

    X_sample, y_sample = resample(X_train,
                                  y_train,
                                  n_samples=3500,
                                  stratify=y_train,
                                  random_state=42)

    param_grid = {
        'n_estimators': list(np.arange(500, 4000, 400)),
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 4, 8, 10],
        'max_features': ['sqrt', 'log2', None, 0.3, 0.5, 0.7],
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
        'max_leaf_nodes': [None, 10, 50, 100, 200],
        'class_weight': ['balanced'],
    }

    rf_random = RandomizedSearchCV(estimator=classifier,
                                   param_distributions=param_grid,
                                   n_iter=50,
                                   scoring='f1_weighted',
                                   cv=3,
                                   verbose=2,
                                   # Seed the candidate sampling so repeated runs
                                   # explore the same 50 settings, matching the
                                   # seeded search in randomSVM.
                                   random_state=42,
                                   n_jobs=-1)

    rf_random.fit(X=X_sample, y=y_sample)

    print('Best Params')
    pprint(rf_random.best_params_)

    return rf_random.best_params_
|
| 312 |
+
|
| 313 |
+
def gridSearch(X: np.ndarray, y: np.ndarray, grid: dict, cv: int = 3):
    """Exhaustive grid search for a random forest, scored on a held-out split.

    Args:
        X: Feature matrix.
        y: Labels; also used to stratify the 67/33 split.
        grid: Parameter grid handed to ``GridSearchCV``.
        cv: Number of cross-validation folds. The value was previously
            hard-coded to 1, which scikit-learn rejects because k-fold
            cross-validation needs at least 2 splits; the default of 3 matches
            the randomized searches in this module.

    Returns:
        The fitted ``GridSearchCV`` object and the metrics dict produced by
        ``evaluate`` on the test split.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)

    # Initialize GridSearchCV with the base model and hyperparameters
    grid_search: GridSearchCV = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=grid,
        cv=cv,
        scoring='f1_weighted',
        verbose=1,
        pre_dispatch=5,
        n_jobs=-1,
    )

    grid_search.fit(X=X_train, y=y_train)

    print('Best Estimator')
    pprint(grid_search.best_estimator_)

    evaluation = evaluate(grid_search, X_test, y_test)

    return grid_search, evaluation
|
| 336 |
+
|
| 337 |
+
|
| 338 |
+
def fetch_uniprot_sequence(uniprot_id: str):
    """Download the amino-acid sequence for a UniProt accession.

    The UniProtKB FASTA endpoint is queried first. When it answers with
    HTTP 200 but the body cannot be read as a single FASTA record, the UniSave
    endpoint is queried instead and the first entry of its response is used.

    Args:
        uniprot_id: Accession interpolated into both URLs.

    Returns:
        The sequence as a string, or ``None`` (after printing a message) when
        no sequence could be obtained.
    """
    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    response = requests.get(url)

    if response.status_code != 200:
        print(f'URL inválido o no accesible: {url}')
        return None

    try:
        # Use SeqIO.read to get a single record
        return str(SeqIO.read(StringIO(response.text), "fasta").seq)

    except Exception:
        # The primary body was not one parsable record: fall back to UniSave.
        url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
        response = requests.get(url)

        if response.status_code != 200:
            print(f'UniSave URL inválido: {url}')
            return None

        try:
            # Split in front of every '>'; for a body that starts with '>'
            # the chunk at index 1 is the first FASTA entry.
            chunks = re.split(r"(?=>)", response.text.strip())
            return str(SeqIO.read(StringIO(chunks[1]), "fasta").seq)
        except Exception:
            print(f'No se pudo obtener la entrada FASTA para {uniprot_id} desde UniSave')

    return None
|
| 367 |
+
|
| 368 |
+
def fetch_refseq_sequence(refseq_id : str):
    """
    Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.

    If the Entrez lookup fails for any reason, the RCSB FASTA endpoint is
    tried with the same identifier.

    Args:
        refseq_id: Accession to look up. ``None`` / NaN short-circuits to
            ``None`` before Entrez is configured or any request is made.

    Returns:
        The raw amino-acid sequence as a string, or ``None`` when the ID is
        missing or neither source yields a parsable FASTA record.
    """
    import os  # local import: only needed for the credential overrides below

    # Check the ID first so a missing accession never touches Entrez.
    if refseq_id is None or pd.isna(refseq_id):
        return None

    # NOTE(review): these credentials are committed to the repository. The
    # literals are kept as defaults so existing behaviour is unchanged, but
    # the key should be rotated and supplied through NCBI_EMAIL /
    # NCBI_API_KEY instead.
    Entrez.email = os.environ.get("NCBI_EMAIL", "puglia.jd@gmail.com")  # REQUIRED
    Entrez.api_key = os.environ.get("NCBI_API_KEY", "d768134734612d58be85117e1ff22e243807")

    try:
        handle = Entrez.efetch(
            db="protein",
            id=refseq_id,
            rettype="fasta",
            retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Close the handle even when parsing fails; it used to leak on
            # that path because close() was only reached after a good parse.
            handle.close()
        return str(record.seq)
    except Exception:
        # Entrez failed: try the RCSB FASTA endpoint with the same ID.
        url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
        response = requests.get(url)
        if response.status_code == 200:
            fasta_data = response.text
            try:
                record = SeqIO.read(StringIO(fasta_data), "fasta")
                return str(record.seq)
            except ValueError:
                print(f"No se pudo convertir {fasta_data}, id: {refseq_id}")

    return None
|
| 402 |
+
|
| 403 |
+
# Main function to fetch sequences for a DataFrame
|
| 404 |
+
def _fetch_sequence_for_row(idx, row):
    """
    Resolve the sequence for one row, trying each ID column in turn.

    The columns are tried in the order ``SwissProt_ID`` (UniProt lookup),
    ``Refseq_Accession`` and ``Other_Accession`` (both RefSeq lookup); the
    first lookup that yields a non-empty sequence wins. An ``HTTPError`` from
    a lookup is reported and treated as "no sequence".

    Returns (idx, sequence), where sequence is None when nothing was found.
    """
    # (column holding the ID, label used in the warning message)
    attempts = (
        ('SwissProt_ID', 'SwissProt'),
        ('Refseq_Accession', 'RefSeq'),
        ('Other_Accession', 'RefSeq'),
    )

    sequence = None
    for column, label in attempts:
        if sequence:
            break

        accession = row.get(column)
        if not accession or pd.isna(accession):
            continue

        try:
            if column == 'SwissProt_ID':
                sequence = fetch_uniprot_sequence(accession)
            else:
                sequence = fetch_refseq_sequence(accession)
        except HTTPError as e:
            print(f"Warning: {label} fetch failed for {accession} with HTTP {e.code}")
            sequence = None

    return idx, sequence
|
| 435 |
+
|
| 436 |
+
|
| 437 |
+
def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: int = None, max_workers: int = 5) -> pd.DataFrame:
    """
    Add a 'sequence' column to the dataframe by fetching sequences from
    SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.

    Args:
        df: Input DataFrame with ID columns. It is copied, not modified.
        batch_size: Optional size of row-chunks to process sequentially.
            ``None`` or a non-positive value processes all rows as one chunk.
        max_workers: Number of threads for parallel fetching.

    Returns:
        DataFrame with added 'sequence' column.
    """
    result_df = df.copy()
    if 'sequence' not in result_df.columns:
        result_df['sequence'] = None

    total_rows = len(result_df)
    # Determine batch indices
    if batch_size and batch_size > 0:
        starts = list(range(0, total_rows, batch_size))
    else:
        starts = [0]
        batch_size = total_rows

    # Overall progress bar
    with tqdm(total=total_rows, desc="Retrieving sequences", unit="row") as pbar:
        for start in starts:
            end = min(start + batch_size, total_rows)
            sub_df = result_df.iloc[start:end]

            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                # Launch parallel tasks
                futures = [
                    executor.submit(_fetch_sequence_for_row, idx, row)
                    for idx, row in sub_df.iterrows()
                ]
                # Collect results as they finish so the bar advances per row.
                for future in as_completed(futures):
                    idx, seq = future.result()
                    result_df.at[idx, 'sequence'] = seq
                    pbar.update(1)

    print("Sequence retrieval complete")
    success_count = int(result_df['sequence'].notna().sum())
    # An empty frame has no meaningful percentage; report 0.0 instead of
    # dividing by zero.
    success_pct = round(success_count / total_rows * 100, 2) if total_rows else 0.0
    print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
          f"({success_pct}%)")
    return result_df
|
| 483 |
+
|
| 484 |
+
# Loaded ESM C clients keyed by (model name, device), so repeated calls reuse
# the already-loaded weights instead of calling from_pretrained every time.
_ESM_CLIENT_CACHE: dict = {}


def _get_esm_client(model, device):
    """Return the ESMC client for ``model`` on ``device``, loading it on first use."""
    key = (model, str(device))
    client = _ESM_CLIENT_CACHE.get(key)
    if client is None:
        client = ESMC.from_pretrained(model).to(device)
        _ESM_CLIENT_CACHE[key] = client
    return client


def esm_embed_sequence(model : Literal["esmc_300m", "esmc_600m"], sequence : str, device : str) -> "LogitsOutput":

    """
    Embed a protein sequence using the specified ESM model.

    The model is loaded once per (model, device) pair and kept in a
    module-level cache; it previously was re-loaded on every call, i.e. once
    per sequence when driven from a loop. The cached model stays resident on
    its device for the lifetime of the process.

    Args:
        model: Name of the ESM model to use.
        sequence: Protein sequence to embed.
        device: Device the model is moved to.
    Returns:
        LogitsOutput: Contains the embeddings and logits for the sequence.
    Raises:
        ESMProteinError: If encoding the protein returns an error object.
    """

    client = _get_esm_client(model, device)

    protein = ESMProtein(sequence=sequence)
    protein_tensor = client.encode(protein)

    # encode() reports failure by returning the error instead of raising it.
    if isinstance(protein_tensor, ESMProteinError):
        raise protein_tensor

    output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))

    return output
|
| 507 |
+
|
| 508 |
+
def esm_save_emb(model: Literal["esmc_300m", "esmc_600m"],
                 seq_list: list[str],
                 id_list: list[str],
                 path: str,
                 device : Literal['cuda', 'cpu'] = 'cuda') -> None:

    """
    Embed every sequence and write one ``<id>.npy`` file per sequence.

    Each embedding is averaged over its first remaining axis (after dropping
    a leading axis when the array is 3-D) before being saved.

    Args:
        model: ESM model name. Options are "esmc_300m" or "esmc_600m".
        seq_list: List of protein sequences.
        id_list: List of identifiers corresponding to the sequences.
        path: Directory to save the embeddings (created if missing).
        device: Device passed through to ``esm_embed_sequence``.
    """

    assert len(seq_list) == len(id_list), "Sequence and ID lists must be the same length."
    os.makedirs(path, exist_ok=True)

    pairs = tqdm(zip(seq_list, id_list), total=len(seq_list), desc="Saving embeddings")
    for position, (sequence, accession) in enumerate(pairs):
        try:
            logits_output: LogitsOutput = esm_embed_sequence(model=model, sequence=sequence, device=device)
            embedding = logits_output.embeddings.cpu().numpy()

            if embedding.ndim == 3:
                embedding = embedding.squeeze(axis=0).mean(axis=0)
            elif embedding.ndim == 2:
                embedding = embedding.mean(axis=0)

            target_file = os.path.join(path, f"{accession}.npy")
            np.save(target_file, embedding)

        except ESMProteinError as e:
            # A failed sequence is reported and skipped; the loop continues.
            print(f"Error processing {accession}: {e}")

        # Periodic cleanup every 100 sequences.
        if position % 100 == 0:
            gc.collect()
            torch.cuda.empty_cache()
|
| 545 |
+
|
| 546 |
+
|
| 547 |
+
def prost_embed_sequence(seq_list: list[str],
                         acc_list: list[str],
                         path: str,
                         device : Literal["cuda:0", "cpu"] = "cuda:0") -> None:

    """
    Embed protein sequences using ProstT5 and save one ``<id>.npy`` per sequence.

    Args:
        seq_list: List of protein sequences to embed.
        acc_list: List of identifiers corresponding to the sequences.
        path: Directory the embeddings are written to (created if missing).
        device: Device to run the model on. The model runs in full precision
            on CPU and half precision otherwise.
    """

    assert len(seq_list) == len(acc_list), "Sequence and ID lists must match"

    os.makedirs(path, exist_ok=True)

    device = torch.device(device)

    tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5")
    model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
    # torch modules expose .float()/.half(); the previous .full() call does
    # not exist and raised AttributeError whenever the CPU path was taken.
    model = model.float() if device.type == 'cpu' else model.half()
    model.eval()

    for i, (seq, acc_id) in enumerate(tqdm(zip(seq_list, acc_list), total=len(seq_list), desc="Processing Sequences")):
        try:
            # Tokenize
            ids = tokenizer(
                seq,
                add_special_tokens=True,
                return_tensors='pt'
            ).to(device)

            # Forward pass
            with torch.no_grad():
                embedding_repr = model(
                    ids.input_ids,
                    attention_mask=ids.attention_mask
                )

            # Number of attended tokens minus one.
            real_len = ids.attention_mask[0].sum().item() - 1
            # The slice below is [1:real_len]; it is empty for real_len <= 1
            # and its mean would be NaN, so such inputs are skipped.
            if real_len <= 1:
                print(f"Sequence too short after tokenization for {acc_id}")
                continue

            # NOTE(review): skipping position 0 presumably drops a ProstT5
            # prefix token supplied by the caller - confirm against the
            # sequence preprocessing.
            emb = embedding_repr.last_hidden_state[0, 1:real_len]
            emb_avg = emb.mean(dim=0).cpu().numpy()

            np.save(os.path.join(path, f"{acc_id}.npy"), emb_avg)

            del ids, embedding_repr, emb, emb_avg

        except RuntimeError as e:
            print(f"RuntimeError while processing {acc_id}: {e}")

        # Periodic cleanup every 100 sequences.
        if i % 100 == 0:
            gc.collect()
            torch.cuda.empty_cache()
|
| 607 |
+
|