jpuglia committed on
Commit
b84b185
·
verified ·
1 Parent(s): bcccd7c

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ Plots/Embeddings/PCA_ESM300m.png filter=lfs diff=lfs merge=lfs -text
37
+ Plots/Embeddings/PCA_ESM600m.png filter=lfs diff=lfs merge=lfs -text
38
+ Plots/Embeddings/PCA_ProstT5.png filter=lfs diff=lfs merge=lfs -text
39
+ Plots/Embeddings/UMAP_ESM300m.png filter=lfs diff=lfs merge=lfs -text
40
+ Plots/Embeddings/UMAP_ESM600m.png filter=lfs diff=lfs merge=lfs -text
41
+ Plots/Embeddings/UMAP_ProstT5.png filter=lfs diff=lfs merge=lfs -text
42
+ Plots/Embeddings/t-SNE_ESM300m.png filter=lfs diff=lfs merge=lfs -text
43
+ Plots/Embeddings/t-SNE_ESM600m.png filter=lfs diff=lfs merge=lfs -text
44
+ Plots/Embeddings/t-SNE_ProstT5.png filter=lfs diff=lfs merge=lfs -text
45
+ notebooks/EmbAnalisis.ipynb filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ *.plk
2
+ *.pkl
3
+ *.npy
4
+ *.joblib
Data/ePSORTdb.tsv ADDED
The diff for this file is too large to render. See raw diff
 
Data/trainingData.csv ADDED
The diff for this file is too large to render. See raw diff
 
Envs/environment.yml ADDED
@@ -0,0 +1,208 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ name: tesisEnv
2
+ channels:
3
+ - bioconda
4
+ - conda-forge
5
+ - anaconda
6
+ - https://repo.anaconda.com/pkgs/main
7
+ - https://repo.anaconda.com/pkgs/r
8
+ dependencies:
9
+ - _libgcc_mutex=0.1=main
10
+ - _openmp_mutex=5.1=1_gnu
11
+ - asttokens=3.0.0=py310h06a4308_0
12
+ - blas=1.0=mkl
13
+ - bzip2=1.0.8=h5eee18b_6
14
+ - c-ares=1.19.1=h5eee18b_0
15
+ - ca-certificates=2025.2.25=h06a4308_0
16
+ - comm=0.2.2=pyhd8ed1ab_1
17
+ - cyrus-sasl=2.1.28=h52b45da_1
18
+ - debugpy=1.8.11=py310h6a678d5_0
19
+ - decorator=5.2.1=pyhd8ed1ab_0
20
+ - entrypoints=0.4=py310h06a4308_0
21
+ - exceptiongroup=1.2.2=pyhd8ed1ab_1
22
+ - expat=2.7.1=h6a678d5_0
23
+ - font-ttf-dejavu-sans-mono=2.37=hd3eb1b0_0
24
+ - font-ttf-inconsolata=2.001=hcb22688_0
25
+ - font-ttf-source-code-pro=2.030=hd3eb1b0_0
26
+ - font-ttf-ubuntu=0.83=h8b1ccd4_0
27
+ - fontconfig=2.14.1=h55d465d_3
28
+ - fonts-anaconda=1=h8fa9717_0
29
+ - freetype=2.13.3=h4a9f257_0
30
+ - icu=73.1=h6a678d5_0
31
+ - importlib-metadata=8.5.0=py310h06a4308_0
32
+ - intel-openmp=2023.1.0=hdb19cb5_46306
33
+ - ipykernel=6.29.5=py310h06a4308_1
34
+ - ipython=8.33.0=pyh907856f_0
35
+ - ipywidgets=8.1.5=py310h06a4308_0
36
+ - jedi=0.19.2=py310h06a4308_0
37
+ - jpeg=9e=h5eee18b_3
38
+ - jsonschema=4.23.0=py310h06a4308_0
39
+ - jsonschema-specifications=2023.7.1=py310h06a4308_0
40
+ - jupyter_client=7.3.4=py310h06a4308_0
41
+ - jupyter_core=5.7.2=py310h06a4308_0
42
+ - jupyterlab_widgets=3.0.13=py310h06a4308_0
43
+ - kaleido-core=0.2.1=h7c8854e_0
44
+ - krb5=1.20.1=h143b758_1
45
+ - ld_impl_linux-64=2.40=h12ee557_0
46
+ - libabseil=20250127.0=cxx17_h6a678d5_0
47
+ - libcups=2.4.2=h2d74bed_1
48
+ - libcurl=8.12.1=hc9e6f67_0
49
+ - libedit=3.1.20230828=h5eee18b_0
50
+ - libev=4.33=h7f8727e_1
51
+ - libffi=3.4.4=h6a678d5_1
52
+ - libgcc-ng=11.2.0=h1234567_1
53
+ - libglib=2.78.4=hdc74915_0
54
+ - libgomp=11.2.0=h1234567_1
55
+ - libiconv=1.16=h5eee18b_3
56
+ - libnghttp2=1.57.0=h2d74bed_0
57
+ - libpng=1.6.39=h5eee18b_0
58
+ - libpq=17.4=hdbd6064_0
59
+ - libprotobuf=5.29.3=hc99497a_0
60
+ - libsodium=1.0.18=h7b6447c_0
61
+ - libssh2=1.11.1=h251f7ec_0
62
+ - libstdcxx-ng=11.2.0=h1234567_1
63
+ - libuuid=1.41.5=h5eee18b_0
64
+ - libxcb=1.15=h7f8727e_0
65
+ - libxkbcommon=1.0.1=h097e994_2
66
+ - libxml2=2.13.7=hfdd30dd_0
67
+ - lz4-c=1.9.4=h6a678d5_1
68
+ - mathjax=2.7.5=h06a4308_0
69
+ - matplotlib-inline=0.1.7=pyhd8ed1ab_1
70
+ - mkl=2023.1.0=h213fc3f_46344
71
+ - mkl-service=2.4.0=py310h5eee18b_2
72
+ - mkl_fft=1.3.11=py310h5eee18b_0
73
+ - mkl_random=1.2.8=py310h1128e8f_0
74
+ - mysql=8.4.0=h721767e_2
75
+ - narwhals=1.31.0=py310h06a4308_1
76
+ - nbformat=5.10.4=py310h06a4308_0
77
+ - ncurses=6.4=h6a678d5_0
78
+ - nest-asyncio=1.6.0=py310h06a4308_0
79
+ - nspr=4.35=h6a678d5_0
80
+ - nss=3.89.1=h6a678d5_0
81
+ - numpy=1.26.4=py310h5f9d8c6_0
82
+ - numpy-base=1.26.4=py310hb5e798b_0
83
+ - openldap=2.6.4=h42fbc30_0
84
+ - openssl=3.0.16=h5eee18b_0
85
+ - packaging=24.2=py310h06a4308_0
86
+ - parso=0.8.4=py310h06a4308_0
87
+ - pcre2=10.42=hebb0a14_1
88
+ - pexpect=4.9.0=pyhd8ed1ab_1
89
+ - pickleshare=0.7.5=pyhd3eb1b0_1003
90
+ - pip=25.0=py310h06a4308_0
91
+ - platformdirs=4.3.6=pyhd8ed1ab_1
92
+ - plotly=6.0.1=py310he3bba80_0
93
+ - prompt-toolkit=3.0.50=pyha770c72_0
94
+ - psutil=5.9.1=py310h5764c6d_0
95
+ - ptyprocess=0.7.0=pyhd3eb1b0_2
96
+ - pure_eval=0.2.3=pyhd8ed1ab_1
97
+ - pyfaidx=0.8.1.3=pyhdfd78af_0
98
+ - pygments=2.19.1=py310h06a4308_0
99
+ - pyqt=6.7.1=py310h6a678d5_1
100
+ - pyqt6-sip=13.9.1=py310h5eee18b_1
101
+ - python=3.10.16=he870216_1
102
+ - python-dateutil=2.9.0.post0=py310h06a4308_1
103
+ - python-fastjsonschema=2.20.0=py310h06a4308_0
104
+ - python-kaleido=0.2.1=py310h06a4308_0
105
+ - python_abi=3.10=2_cp310
106
+ - pyvcf3=1.0.3=pyhdfd78af_0
107
+ - pyzmq=26.2.0=py310h6a678d5_0
108
+ - qtbase=6.7.3=hdaa5aa8_0
109
+ - qtdeclarative=6.7.3=h6a678d5_0
110
+ - qtsvg=6.7.3=he621ea3_0
111
+ - qttools=6.7.3=h80c7b02_0
112
+ - qtwebchannel=6.7.3=h6a678d5_0
113
+ - qtwebsockets=6.7.3=h6a678d5_0
114
+ - readline=8.2=h5eee18b_0
115
+ - referencing=0.30.2=py310h06a4308_0
116
+ - rpds-py=0.22.3=py310h4aa5aa6_0
117
+ - setuptools=75.8.0=py310h06a4308_0
118
+ - sip=6.10.0=py310h6a678d5_0
119
+ - six=1.17.0=py310h06a4308_0
120
+ - sqlite=3.45.3=h5eee18b_0
121
+ - stack_data=0.6.3=pyhd8ed1ab_1
122
+ - tbb=2021.8.0=hdb19cb5_0
123
+ - tk=8.6.14=h39e8969_0
124
+ - tomli=2.0.1=py310h06a4308_0
125
+ - tornado=6.1=py310h7f8727e_0
126
+ - traitlets=5.14.3=py310h06a4308_0
127
+ - typing_extensions=4.12.2=py310h06a4308_0
128
+ - wcwidth=0.2.13=pyhd8ed1ab_1
129
+ - wheel=0.45.1=py310h06a4308_0
130
+ - widgetsnbextension=4.0.13=py310h06a4308_0
131
+ - xcb-util-cursor=0.1.4=h5eee18b_0
132
+ - xz=5.6.4=h5eee18b_1
133
+ - zeromq=4.3.5=h6a678d5_0
134
+ - zipp=3.21.0=py310h06a4308_0
135
+ - zlib=1.2.13=h5eee18b_1
136
+ - pip:
137
+ - attrs==25.1.0
138
+ - biopython==1.85
139
+ - biotite==0.41.2
140
+ - brotli==1.1.0
141
+ - certifi==2025.1.31
142
+ - charset-normalizer==3.4.1
143
+ - cloudpathlib==0.20.0
144
+ - contourpy==1.3.1
145
+ - cycler==0.12.1
146
+ - dna-features-viewer==3.1.4
147
+ - einops==0.8.1
148
+ - esm==3.1.4
149
+ - executing==2.2.0
150
+ - filelock==3.17.0
151
+ - fonttools==4.56.0
152
+ - fsspec==2025.2.0
153
+ - graphql-core==3.2.6
154
+ - graphviz==0.20.3
155
+ - huggingface-hub==0.29.1
156
+ - idna==3.10
157
+ - jinja2==3.1.5
158
+ - joblib==1.4.2
159
+ - kiwisolver==1.4.8
160
+ - markupsafe==3.0.2
161
+ - matplotlib==3.10.1
162
+ - mpmath==1.3.0
163
+ - msgpack==1.1.0
164
+ - msgpack-numpy==0.4.8
165
+ - networkx==3.4.2
166
+ - nvidia-cublas-cu12==12.4.5.8
167
+ - nvidia-cuda-cupti-cu12==12.4.127
168
+ - nvidia-cuda-nvrtc-cu12==12.4.127
169
+ - nvidia-cuda-runtime-cu12==12.4.127
170
+ - nvidia-cudnn-cu12==9.1.0.70
171
+ - nvidia-cufft-cu12==11.2.1.3
172
+ - nvidia-curand-cu12==10.3.5.147
173
+ - nvidia-cusolver-cu12==11.6.1.9
174
+ - nvidia-cusparse-cu12==12.3.1.170
175
+ - nvidia-cusparselt-cu12==0.6.2
176
+ - nvidia-nccl-cu12==2.21.5
177
+ - nvidia-nvjitlink-cu12==12.4.127
178
+ - nvidia-nvtx-cu12==12.4.127
179
+ - pandas==2.2.3
180
+ - pillow==11.1.0
181
+ - protobuf==6.31.0
182
+ - py3dmol==2.4.2
183
+ - pyparsing==3.2.1
184
+ - pytz==2025.1
185
+ - pyyaml==6.0.2
186
+ - rcsb-api==1.1.3
187
+ - regex==2024.11.6
188
+ - requests==2.32.3
189
+ - rustworkx==0.16.0
190
+ - safetensors==0.5.3
191
+ - scikit-learn==1.6.1
192
+ - scipy==1.15.2
193
+ - sentencepiece==0.2.0
194
+ - sympy==1.13.1
195
+ - tenacity==9.0.0
196
+ - threadpoolctl==3.5.0
197
+ - tokenizers==0.20.3
198
+ - torch==2.6.0
199
+ - torchtext==0.18.0
200
+ - torchvision==0.21.0
201
+ - tqdm==4.67.1
202
+ - transformers==4.46.3
203
+ - triton==3.2.0
204
+ - tzdata==2025.1
205
+ - uniprot-id-mapper==1.1.4
206
+ - urllib3==2.3.0
207
+ - zstd==1.5.6.5
208
+ prefix: /home/jpuglia/miniconda3/envs/tesisEnv
Envs/requirements.txt ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ asttokens @ file:///home/conda/feedstock_root/build_artifacts/asttokens_1733250440834/work
2
+ attrs @ file:///croot/attrs_1734533101012/work
3
+ biopython @ file:///home/builder/ci_310/biopython_1640788437968/work
4
+ biotite==0.41.2
5
+ Brotli==1.1.0
6
+ certifi==2025.1.31
7
+ charset-normalizer==3.4.1
8
+ cloudpathlib==0.20.0
9
+ comm @ file:///home/conda/feedstock_root/build_artifacts/comm_1733502965406/work
10
+ contourpy==1.3.1
11
+ cycler==0.12.1
12
+ debugpy @ file:///croot/debugpy_1736267418885/work
13
+ decorator @ file:///home/conda/feedstock_root/build_artifacts/decorator_1740384970518/work
14
+ dna_features_viewer==3.1.4
15
+ einops==0.8.1
16
+ entrypoints @ file:///home/conda/feedstock_root/build_artifacts/entrypoints_1733327148154/work
17
+ esm==3.1.4
18
+ exceptiongroup @ file:///home/conda/feedstock_root/build_artifacts/exceptiongroup_1733208806608/work
19
+ executing @ file:///home/conda/feedstock_root/build_artifacts/executing_1733569351617/work
20
+ fastjsonschema @ file:///croot/python-fastjsonschema_1731939362158/work
21
+ filelock==3.17.0
22
+ fonttools==4.56.0
23
+ fsspec==2025.2.0
24
+ graphviz==0.20.3
25
+ huggingface-hub==0.29.1
26
+ idna==3.10
27
+ importlib_metadata @ file:///croot/importlib_metadata-suite_1732633488278/work
28
+ ipykernel @ file:///home/conda/feedstock_root/build_artifacts/ipykernel_1719845459717/work
29
+ ipython @ file:///home/conda/feedstock_root/build_artifacts/bld/rattler-build_ipython_1740856895/work
30
+ ipywidgets @ file:///home/conda/feedstock_root/build_artifacts/ipywidgets_1733493556527/work
31
+ jedi @ file:///home/conda/feedstock_root/build_artifacts/jedi_1733300866624/work
32
+ Jinja2==3.1.5
33
+ joblib==1.4.2
34
+ jsonschema @ file:///croot/jsonschema_1728486696720/work
35
+ jsonschema-specifications @ file:///croot/jsonschema-specifications_1699032386549/work
36
+ jupyter-client @ file:///home/conda/feedstock_root/build_artifacts/jupyter_client_1654730843242/work
37
+ jupyter_core @ file:///home/conda/feedstock_root/build_artifacts/jupyter_core_1727163409502/work
38
+ jupyterlab_widgets @ file:///home/conda/feedstock_root/build_artifacts/jupyterlab_widgets_1733428046021/work
39
+ kaleido @ file:///home/conda/feedstock_root/build_artifacts/python-kaleido_1615204619408/work
40
+ kiwisolver==1.4.8
41
+ MarkupSafe==3.0.2
42
+ matplotlib==3.10.1
43
+ matplotlib-inline @ file:///home/conda/feedstock_root/build_artifacts/matplotlib-inline_1733416936468/work
44
+ mkl-service==2.4.0
45
+ mkl_fft @ file:///io/mkl313/mkl_fft_1730824109137/work
46
+ mkl_random @ file:///io/mkl313/mkl_random_1730823916628/work
47
+ mpmath==1.3.0
48
+ msgpack==1.1.0
49
+ msgpack-numpy==0.4.8
50
+ narwhals @ file:///croot/narwhals_1742845957875/work
51
+ nbformat @ file:///croot/nbformat_1728049424075/work
52
+ nest_asyncio @ file:///home/conda/feedstock_root/build_artifacts/nest-asyncio_1733325553580/work
53
+ networkx==3.4.2
54
+ numpy @ file:///croot/numpy_and_numpy_base_1708638617955/work/dist/numpy-1.26.4-cp310-cp310-linux_x86_64.whl#sha256=d8cd837ed43e87f77e6efaa08e8de927ca030a1c9c5d04624432d6fb9a74a5ee
55
+ nvidia-cublas-cu12==12.4.5.8
56
+ nvidia-cuda-cupti-cu12==12.4.127
57
+ nvidia-cuda-nvrtc-cu12==12.4.127
58
+ nvidia-cuda-runtime-cu12==12.4.127
59
+ nvidia-cudnn-cu12==9.1.0.70
60
+ nvidia-cufft-cu12==11.2.1.3
61
+ nvidia-curand-cu12==10.3.5.147
62
+ nvidia-cusolver-cu12==11.6.1.9
63
+ nvidia-cusparse-cu12==12.3.1.170
64
+ nvidia-cusparselt-cu12==0.6.2
65
+ nvidia-nccl-cu12==2.21.5
66
+ nvidia-nvjitlink-cu12==12.4.127
67
+ nvidia-nvtx-cu12==12.4.127
68
+ packaging @ file:///home/conda/feedstock_root/build_artifacts/packaging_1733203243479/work
69
+ pandas==2.2.3
70
+ parso @ file:///home/conda/feedstock_root/build_artifacts/parso_1733271261340/work
71
+ pexpect @ file:///home/conda/feedstock_root/build_artifacts/pexpect_1733301927746/work
72
+ pickleshare @ file:///home/conda/feedstock_root/build_artifacts/pickleshare_1733327343728/work
73
+ pillow==11.1.0
74
+ platformdirs @ file:///home/conda/feedstock_root/build_artifacts/platformdirs_1733232627818/work
75
+ plotly @ file:///home/conda/feedstock_root/build_artifacts/plotly_1742240435426/work
76
+ prompt_toolkit @ file:///home/conda/feedstock_root/build_artifacts/prompt-toolkit_1737453357274/work
77
+ psutil @ file:///home/conda/feedstock_root/build_artifacts/psutil_1653089181607/work
78
+ ptyprocess @ file:///home/conda/feedstock_root/build_artifacts/ptyprocess_1733302279685/work/dist/ptyprocess-0.7.0-py2.py3-none-any.whl#sha256=92c32ff62b5fd8cf325bec5ab90d7be3d2a8ca8c8a3813ff487a8d2002630d1f
79
+ pure_eval @ file:///home/conda/feedstock_root/build_artifacts/pure_eval_1733569405015/work
80
+ py3Dmol==2.4.2
81
+ pyfaidx @ file:///opt/conda/conda-bld/pyfaidx_1728570107633/work
82
+ Pygments @ file:///home/conda/feedstock_root/build_artifacts/pygments_1736243443484/work
83
+ pyparsing==3.2.1
84
+ PyQt6==6.7.1
85
+ PyQt6_sip @ file:///croot/pyqt-split_1744804475988/work/pyqt_sip
86
+ python-dateutil @ file:///home/conda/feedstock_root/build_artifacts/python-dateutil_1733215673016/work
87
+ pytz==2025.1
88
+ PyVCF3 @ file:///opt/conda/conda-bld/pyvcf3_1650931562118/work
89
+ PyYAML==6.0.2
90
+ pyzmq @ file:///croot/pyzmq_1734687138743/work
91
+ referencing @ file:///croot/referencing_1699012038513/work
92
+ regex==2024.11.6
93
+ requests==2.32.3
94
+ rpds-py @ file:///croot/rpds-py_1736541261634/work
95
+ safetensors==0.5.3
96
+ scikit-learn==1.6.1
97
+ scipy==1.15.2
98
+ sip @ file:///croot/sip_1738856193618/work
99
+ six @ file:///home/conda/feedstock_root/build_artifacts/six_1733380938961/work
100
+ stack_data @ file:///home/conda/feedstock_root/build_artifacts/stack_data_1733569443808/work
101
+ sympy==1.13.1
102
+ tenacity==9.0.0
103
+ threadpoolctl==3.5.0
104
+ tokenizers==0.20.3
105
+ tomli @ file:///opt/conda/conda-bld/tomli_1657175507142/work
106
+ torch==2.6.0
107
+ torchtext==0.18.0
108
+ torchvision==0.21.0
109
+ tornado @ file:///home/conda/feedstock_root/build_artifacts/tornado_1648827254365/work
110
+ tqdm==4.67.1
111
+ traitlets @ file:///home/conda/feedstock_root/build_artifacts/traitlets_1733367359838/work
112
+ transformers==4.46.3
113
+ triton==3.2.0
114
+ typing_extensions @ file:///home/conda/feedstock_root/build_artifacts/typing_extensions_1733188668063/work
115
+ tzdata==2025.1
116
+ uniprot-id-mapper==1.1.4
117
+ urllib3==2.3.0
118
+ wcwidth @ file:///home/conda/feedstock_root/build_artifacts/wcwidth_1733231326287/work
119
+ widgetsnbextension @ file:///home/conda/feedstock_root/build_artifacts/widgetsnbextension_1733128559935/work
120
+ zipp @ file:///croot/zipp_1732630741423/work
121
+ zstd==1.5.6.5
Plots/ConfusionMatrix/CM_ESM600m.png ADDED
Plots/ConfusionMatrix/CM_RF_ESM300m.png ADDED
Plots/ConfusionMatrix/CM_RF_ProstT5.png ADDED
Plots/ConfusionMatrix/CM_SVM_ESM300m.png ADDED
Plots/ConfusionMatrix/CM_SVM_ESM600m.png ADDED
Plots/ConfusionMatrix/CM_SVM_ProstT5.png ADDED
Plots/Embeddings/PCA_ESM300m.png ADDED

Git LFS Details

  • SHA256: 5829c7fa9d5c4dbf5f9d063b2d6f2eda461515fe058333a9993425ee14c8ddc7
  • Pointer size: 131 Bytes
  • Size of remote file: 353 kB
Plots/Embeddings/PCA_ESM600m.png ADDED

Git LFS Details

  • SHA256: dfc444490d5dcd0ce78cbf0adaf36704b7627f34f11ace6875b1d611df019b1f
  • Pointer size: 131 Bytes
  • Size of remote file: 304 kB
Plots/Embeddings/PCA_ProstT5.png ADDED

Git LFS Details

  • SHA256: 9007656731995d9ddc15aa041fa3240c556a012e72bfec26407ef010705d34a4
  • Pointer size: 131 Bytes
  • Size of remote file: 333 kB
Plots/Embeddings/UMAP_ESM300m.png ADDED

Git LFS Details

  • SHA256: 1ac5eb50417887ffcab711918d9f28d29c7a0a009fbc9f7aab99061c9e01dbb7
  • Pointer size: 131 Bytes
  • Size of remote file: 128 kB
Plots/Embeddings/UMAP_ESM600m.png ADDED

Git LFS Details

  • SHA256: 65ed1e0ce20d144390261ec444dafde2dacb939f70b1d28e3bb1d9f49722bc13
  • Pointer size: 131 Bytes
  • Size of remote file: 133 kB
Plots/Embeddings/UMAP_ProstT5.png ADDED

Git LFS Details

  • SHA256: b687037c86a9c65d54ef03b674b3812996dfb97d723e55177b3a4864be2947e2
  • Pointer size: 131 Bytes
  • Size of remote file: 113 kB
Plots/Embeddings/t-SNE_ESM300m.png ADDED

Git LFS Details

  • SHA256: f0dd6ce7c94934160b871fe80740dd8809110789ae2b979df84b1a6245464a5f
  • Pointer size: 131 Bytes
  • Size of remote file: 284 kB
Plots/Embeddings/t-SNE_ESM600m.png ADDED

Git LFS Details

  • SHA256: d4b40e9f023cec35097d7190ce319252eb508aa9fd4828669b9e498a758039a7
  • Pointer size: 131 Bytes
  • Size of remote file: 283 kB
Plots/Embeddings/t-SNE_ProstT5.png ADDED

Git LFS Details

  • SHA256: 26c4f88d0cdba816ae3f09aaab1e886d50dc10339d9052b155a9d3cbfdd97c7a
  • Pointer size: 131 Bytes
  • Size of remote file: 217 kB
Plots/ModelEvaluations/RFevaluacion.png ADDED
Plots/ModelEvaluations/SVMevaluacion.png ADDED
Plots/TaxDistributionPSORT.svg ADDED
ProteinLocationPredictor/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
ProteinLocationPredictor/README.md ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ ---
2
+ license: mit
3
+ ---
RepoStructure.txt ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ProteinSubcellularLocPredictor/
2
+
3
+ ├── Data/ # Raw and processed datasets
4
+ │ ├── raw/ # Raw, unaltered data
5
+ │ ├── processed/ # Cleaned or feature-engineered data
6
+ │ └── README.md # Explain data sources and formats
7
+
8
+ ├── Notebooks/ # Jupyter notebooks for EDA, training, etc.
9
+ │ ├── 01_eda.ipynb
10
+ │ ├── 02_preprocessing.ipynb
11
+ │ ├── 03_training.ipynb
12
+ │ └── 04_evaluation.ipynb
13
+
14
+ ├── Deployment/ # Code for using the trained model
15
+ │ ├── predictor.py # Main script to load model and predict
16
+ │ ├── api.py # Optional: REST API using Flask/FastAPI
17
+ │ └── cli.py # Optional: Command-line interface
18
+
19
+ ├── src/ # Python modules shared between notebooks & deployment
20
+ │ ├── __init__.py
21
+ │ ├── preprocessing.py # Feature engineering, tokenization, etc.
22
+ │ ├── model.py # Model creation/training/loading
23
+ │ ├── utils.py # Helper functions
24
+ │ └── config.py # Paths, constants, and config values
25
+
26
+ ├── .gitignore # Ignore datasets, checkpoints, virtual envs, etc.
27
+ ├── requirements.txt # Python package dependencies
28
+ ├── README.md # Project overview, setup, usage
29
+ └── LICENSE # Your preferred open-source license (e.g., MIT)
notebooks/EDA_Psort.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/ESMC_300m.ipynb ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "c409c4ad",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from esm.models.esmc import ESMC\n",
11
+ "from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
12
+ "from esm.sdk.forge import ESM3ForgeInferenceClient\n",
13
+ "import pandas as pd\n",
14
+ "import os\n",
15
+ "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
16
+ "from tqdm import tqdm\n",
17
+ "import numpy as np\n",
18
+ "import os\n",
19
+ "import torch\n",
20
+ "import gc"
21
+ ]
22
+ },
23
+ {
24
+ "cell_type": "code",
25
+ "execution_count": 4,
26
+ "id": "7f8f916c",
27
+ "metadata": {},
28
+ "outputs": [
29
+ {
30
+ "data": {
31
+ "text/html": [
32
+ "<div>\n",
33
+ "<style scoped>\n",
34
+ " .dataframe tbody tr th:only-of-type {\n",
35
+ " vertical-align: middle;\n",
36
+ " }\n",
37
+ "\n",
38
+ " .dataframe tbody tr th {\n",
39
+ " vertical-align: top;\n",
40
+ " }\n",
41
+ "\n",
42
+ " .dataframe thead th {\n",
43
+ " text-align: right;\n",
44
+ " }\n",
45
+ "</style>\n",
46
+ "<table border=\"1\" class=\"dataframe\">\n",
47
+ " <thead>\n",
48
+ " <tr style=\"text-align: right;\">\n",
49
+ " <th></th>\n",
50
+ " <th>SwissProt_ID</th>\n",
51
+ " <th>Refseq_Accession</th>\n",
52
+ " <th>Other_Accession</th>\n",
53
+ " <th>GramStain</th>\n",
54
+ " <th>Experimental_Localization</th>\n",
55
+ " <th>Phylum</th>\n",
56
+ " <th>Class</th>\n",
57
+ " <th>Organism</th>\n",
58
+ " <th>sequence</th>\n",
59
+ " </tr>\n",
60
+ " </thead>\n",
61
+ " <tbody>\n",
62
+ " <tr>\n",
63
+ " <th>0</th>\n",
64
+ " <td>P50307</td>\n",
65
+ " <td>NaN</td>\n",
66
+ " <td>NaN</td>\n",
67
+ " <td>Gram positive</td>\n",
68
+ " <td>Cytoplasmic</td>\n",
69
+ " <td>Firmicutes</td>\n",
70
+ " <td>Bacilli</td>\n",
71
+ " <td>Staphylococcus aureus</td>\n",
72
+ " <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
73
+ " </tr>\n",
74
+ " <tr>\n",
75
+ " <th>1</th>\n",
76
+ " <td>P01552</td>\n",
77
+ " <td>NaN</td>\n",
78
+ " <td>NaN</td>\n",
79
+ " <td>Gram positive</td>\n",
80
+ " <td>Extracellular</td>\n",
81
+ " <td>Firmicutes</td>\n",
82
+ " <td>Bacilli</td>\n",
83
+ " <td>Staphylococcus aureus</td>\n",
84
+ " <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
85
+ " </tr>\n",
86
+ " <tr>\n",
87
+ " <th>2</th>\n",
88
+ " <td>P09978</td>\n",
89
+ " <td>NaN</td>\n",
90
+ " <td>NaN</td>\n",
91
+ " <td>Gram positive</td>\n",
92
+ " <td>Extracellular</td>\n",
93
+ " <td>Firmicutes</td>\n",
94
+ " <td>Bacilli</td>\n",
95
+ " <td>Staphylococcus aureus</td>\n",
96
+ " <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>3</th>\n",
100
+ " <td>P45723</td>\n",
101
+ " <td>NaN</td>\n",
102
+ " <td>NaN</td>\n",
103
+ " <td>Gram positive</td>\n",
104
+ " <td>Extracellular</td>\n",
105
+ " <td>Firmicutes</td>\n",
106
+ " <td>Bacilli</td>\n",
107
+ " <td>Staphylococcus aureus</td>\n",
108
+ " <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
109
+ " </tr>\n",
110
+ " <tr>\n",
111
+ " <th>4</th>\n",
112
+ " <td>P81177</td>\n",
113
+ " <td>NaN</td>\n",
114
+ " <td>NaN</td>\n",
115
+ " <td>Gram positive</td>\n",
116
+ " <td>Extracellular</td>\n",
117
+ " <td>Firmicutes</td>\n",
118
+ " <td>Bacilli</td>\n",
119
+ " <td>Staphylococcus aureus</td>\n",
120
+ " <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
121
+ " </tr>\n",
122
+ " </tbody>\n",
123
+ "</table>\n",
124
+ "</div>"
125
+ ],
126
+ "text/plain": [
127
+ " SwissProt_ID Refseq_Accession Other_Accession GramStain \\\n",
128
+ "0 P50307 NaN NaN Gram positive \n",
129
+ "1 P01552 NaN NaN Gram positive \n",
130
+ "2 P09978 NaN NaN Gram positive \n",
131
+ "3 P45723 NaN NaN Gram positive \n",
132
+ "4 P81177 NaN NaN Gram positive \n",
133
+ "\n",
134
+ " Experimental_Localization Phylum Class Organism \\\n",
135
+ "0 Cytoplasmic Firmicutes Bacilli Staphylococcus aureus \n",
136
+ "1 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
137
+ "2 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
138
+ "3 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
139
+ "4 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
140
+ "\n",
141
+ " sequence \n",
142
+ "0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
143
+ "1 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
144
+ "2 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
145
+ "3 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
146
+ "4 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
147
+ ]
148
+ },
149
+ "execution_count": 4,
150
+ "metadata": {},
151
+ "output_type": "execute_result"
152
+ }
153
+ ],
154
+ "source": [
155
+ "sequences: pd.DataFrame = pd.read_csv('../Data/trainingData.csv')\n",
156
+ "sequences.head()"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": null,
162
+ "id": "07a49fd0",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_300m\").to(\"cuda\")"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": null,
172
+ "id": "e562c770",
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "# Set up the output directory for the esmc_300m embeddings.\n",
177
+ "embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm300m/embeddings\")\n",
178
+ "os.makedirs(embeddings_dir, exist_ok=True)"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "id": "294c6798",
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "# Helpers: embed a single sequence, and embed/save every sequence in a DataFrame as .npy files.\n",
189
+ "def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:\n",
190
+ " protein = ESMProtein(sequence=sequence)\n",
191
+ " protein_tensor = client.encode(protein)\n",
192
+ " if isinstance(protein_tensor, ESMProteinError):\n",
193
+ " raise protein_tensor\n",
194
+ " output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))\n",
195
+ " return output\n",
196
+ "\n",
197
+ "\n",
198
+ "def save_emb(dir: str, df: pd.DataFrame, client: ESM3ForgeInferenceClient) -> None:\n",
199
+ " dir = os.path.expanduser(dir)\n",
200
+ " os.makedirs(dir, exist_ok=True)\n",
201
+ "\n",
202
+ " for i in tqdm(df.index, desc=\"Embedding sequences\"):\n",
203
+ " try:\n",
204
+ " output: LogitsOutput = embed_sequence(client=client, sequence=df.loc[i, 'sequence'])\n",
205
+ " embeddings_np: np.ndarray = output.embeddings.cpu().numpy()\n",
206
+ "\n",
207
+ " if not pd.isna(df.loc[i, 'SwissProt_ID']):\n",
208
+ " identifier = df.loc[i, 'SwissProt_ID']\n",
209
+ " elif not pd.isna(df.loc[i, 'Refseq_Accession']):\n",
210
+ " identifier = df.loc[i, 'Refseq_Accession']\n",
211
+ " elif not pd.isna(df.loc[i, 'Other_Accession']):\n",
212
+ " identifier = df.loc[i, 'Other_Accession']\n",
213
+ " else:\n",
214
+ " identifier = f\"unknown_{i}\"\n",
215
+ "\n",
216
+ " file_path: str = os.path.join(dir, f\"{identifier}.npy\")\n",
217
+ " np.save(file_path, embeddings_np)\n",
218
+ "\n",
219
+ " del output\n",
220
+ " gc.collect()\n",
221
+ " torch.cuda.empty_cache()\n",
222
+ "\n",
223
+ " except Exception as e:\n",
224
+ " print(f\"Error embedding index {i}: {e}\")"
225
+ ]
226
+ },
227
+ {
228
+ "cell_type": "code",
229
+ "execution_count": null,
230
+ "id": "80db4990",
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "\n",
235
+ " \n",
236
+ "# Embed every sequence and save one .npy file per protein in embeddings_dir.\n",
237
+ "save_emb(embeddings_dir, sequences,client = client)\n"
238
+ ]
239
+ },
240
+ {
241
+ "cell_type": "code",
242
+ "execution_count": null,
243
+ "id": "77bf92c6",
244
+ "metadata": {},
245
+ "outputs": [],
246
+ "source": [
247
+ "sequences.loc[[11392]]"
248
+ ]
249
+ },
250
+ {
251
+ "cell_type": "code",
252
+ "execution_count": 9,
253
+ "id": "365d9fdb",
254
+ "metadata": {},
255
+ "outputs": [],
256
+ "source": [
257
+ "sequences = sequences.drop(index=11392)"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "code",
262
+ "execution_count": null,
263
+ "id": "ad8a1990",
264
+ "metadata": {},
265
+ "outputs": [],
266
+ "source": [
267
+ "# Set up the output directory for the esmc_600m embeddings and load the esmc_600m model.\n",
268
+ "embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
269
+ "os.makedirs(embeddings_dir, exist_ok=True)\n",
270
+ "client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "id": "d42e5263",
277
+ "metadata": {},
278
+ "outputs": [],
279
+ "source": [
280
+ "save_emb(embeddings_dir, sequences,client = client)"
281
+ ]
282
+ },
283
+ {
284
+ "cell_type": "code",
285
+ "execution_count": 2,
286
+ "id": "df91fc10",
287
+ "metadata": {},
288
+ "outputs": [],
289
+ "source": [
290
+ "def load_single_embedding(row, id_col, path):\n",
291
+ " try:\n",
292
+ " emb = np.load(os.path.join(path, f\"{row[id_col]}.npy\"))\n",
293
+ " emb = emb.squeeze(axis=0)\n",
294
+ " emb = np.mean(emb, axis=0)\n",
295
+ " return emb\n",
296
+ " except Exception as e:\n",
297
+ " print(f\"Error loading embedding {row[id_col]} due to {e}\")\n",
298
+ " return None\n",
299
+ "\n",
300
+ "def load_emb_parallel(df: pd.DataFrame, id_col: str, path: str, max_workers=None) -> list:\n",
301
+ " embeddings = []\n",
302
+ " with ProcessPoolExecutor(max_workers=max_workers) as executor:\n",
303
+ " futures = {\n",
304
+ " executor.submit(load_single_embedding, df.loc[i], id_col, path): i for i in df.index\n",
305
+ " }\n",
306
+ "\n",
307
+ " for future in tqdm(as_completed(futures), total=len(futures), desc=\"Loading embeddings\"):\n",
308
+ " emb = future.result()\n",
309
+ " if emb is not None:\n",
310
+ " embeddings.append(emb)\n",
311
+ " return embeddings\n",
312
+ "\n"
313
+ ]
314
+ },
315
+ {
316
+ "cell_type": "code",
317
+ "execution_count": 5,
318
+ "id": "329701f6",
319
+ "metadata": {},
320
+ "outputs": [],
321
+ "source": [
322
+ "sequences['Preferred_ID'] = sequences['SwissProt_ID'].fillna(sequences['Refseq_Accession']).fillna(sequences['Other_Accession'])\n"
323
+ ]
324
+ },
325
+ {
326
+ "cell_type": "code",
327
+ "execution_count": 6,
328
+ "id": "9b720ff2",
329
+ "metadata": {},
330
+ "outputs": [
331
+ {
332
+ "name": "stderr",
333
+ "output_type": "stream",
334
+ "text": [
335
+ "Loading embeddings: 97%|█████████▋| 11377/11691 [05:32<00:10, 31.20it/s]"
336
+ ]
337
+ },
338
+ {
339
+ "name": "stdout",
340
+ "output_type": "stream",
341
+ "text": [
342
+ "Error loading embedding Q9I120 due to [Errno 2] No such file or directory: '/home/jpuglia/Documentos/Tesis/datosGenerados/esm600m/embeddings/Q9I120.npy'\n"
343
+ ]
344
+ },
345
+ {
346
+ "name": "stderr",
347
+ "output_type": "stream",
348
+ "text": [
349
+ "Loading embeddings: 100%|██████████| 11691/11691 [05:40<00:00, 34.29it/s]\n"
350
+ ]
351
+ }
352
+ ],
353
+ "source": [
354
+ "embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
355
+ "embeddings = load_emb_parallel(sequences, 'Preferred_ID',embeddings_dir)"
356
+ ]
357
+ },
358
+ {
359
+ "cell_type": "code",
360
+ "execution_count": 15,
361
+ "id": "765209e3",
362
+ "metadata": {},
363
+ "outputs": [
364
+ {
365
+ "name": "stdout",
366
+ "output_type": "stream",
367
+ "text": [
368
+ "Embeddings count: 11690\n",
369
+ "Sequences count: 11690\n"
370
+ ]
371
+ }
372
+ ],
373
+ "source": [
374
+ "print(f\"Embeddings count: {len(embeddings)}\")\n",
375
+ "print(f\"Sequences count: {len(sequences)}\")\n"
376
+ ]
377
+ },
378
+ {
379
+ "cell_type": "code",
380
+ "execution_count": 17,
381
+ "id": "63bf7f6c",
382
+ "metadata": {},
383
+ "outputs": [
384
+ {
385
+ "data": {
386
+ "text/plain": [
387
+ "(1152,)"
388
+ ]
389
+ },
390
+ "execution_count": 17,
391
+ "metadata": {},
392
+ "output_type": "execute_result"
393
+ }
394
+ ],
395
+ "source": [
396
+ "embeddings[0].shape"
397
+ ]
398
+ }
399
+ ],
400
+ "metadata": {
401
+ "kernelspec": {
402
+ "display_name": "tesisEnv",
403
+ "language": "python",
404
+ "name": "python3"
405
+ },
406
+ "language_info": {
407
+ "codemirror_mode": {
408
+ "name": "ipython",
409
+ "version": 3
410
+ },
411
+ "file_extension": ".py",
412
+ "mimetype": "text/x-python",
413
+ "name": "python",
414
+ "nbconvert_exporter": "python",
415
+ "pygments_lexer": "ipython3",
416
+ "version": "3.10.16"
417
+ }
418
+ },
419
+ "nbformat": 4,
420
+ "nbformat_minor": 5
421
+ }
notebooks/ESMC_600m.ipynb ADDED
@@ -0,0 +1,256 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "c409c4ad",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from esm.models.esmc import ESMC\n",
11
+ "from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
12
+ "from esm.sdk.forge import ESM3ForgeInferenceClient\n",
13
+ "from esm.sdk import batch_executor\n",
14
+ "import pandas as pd\n",
15
+ "import os\n",
16
+ "import csv\n",
17
+ "import numpy as np\n",
18
+ "import torch"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 3,
24
+ "id": "7f8f916c",
25
+ "metadata": {},
26
+ "outputs": [
27
+ {
28
+ "data": {
29
+ "text/html": [
30
+ "<div>\n",
31
+ "<style scoped>\n",
32
+ " .dataframe tbody tr th:only-of-type {\n",
33
+ " vertical-align: middle;\n",
34
+ " }\n",
35
+ "\n",
36
+ " .dataframe tbody tr th {\n",
37
+ " vertical-align: top;\n",
38
+ " }\n",
39
+ "\n",
40
+ " .dataframe thead th {\n",
41
+ " text-align: right;\n",
42
+ " }\n",
43
+ "</style>\n",
44
+ "<table border=\"1\" class=\"dataframe\">\n",
45
+ " <thead>\n",
46
+ " <tr style=\"text-align: right;\">\n",
47
+ " <th></th>\n",
48
+ " <th>SwissProt_ID</th>\n",
49
+ " <th>Experimental_Localization</th>\n",
50
+ " <th>Organism</th>\n",
51
+ " <th>Phylum</th>\n",
52
+ " <th>Class</th>\n",
53
+ " <th>GramStain</th>\n",
54
+ " <th>Sequence</th>\n",
55
+ " </tr>\n",
56
+ " </thead>\n",
57
+ " <tbody>\n",
58
+ " <tr>\n",
59
+ " <th>0</th>\n",
60
+ " <td>P50307</td>\n",
61
+ " <td>Cytoplasmic</td>\n",
62
+ " <td>Staphylococcus aureus</td>\n",
63
+ " <td>Firmicutes</td>\n",
64
+ " <td>Bacilli</td>\n",
65
+ " <td>1.0</td>\n",
66
+ " <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
67
+ " </tr>\n",
68
+ " <tr>\n",
69
+ " <th>1</th>\n",
70
+ " <td>P01552</td>\n",
71
+ " <td>Extracellular</td>\n",
72
+ " <td>Staphylococcus aureus</td>\n",
73
+ " <td>Firmicutes</td>\n",
74
+ " <td>Bacilli</td>\n",
75
+ " <td>1.0</td>\n",
76
+ " <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
77
+ " </tr>\n",
78
+ " <tr>\n",
79
+ " <th>2</th>\n",
80
+ " <td>P09978</td>\n",
81
+ " <td>Extracellular</td>\n",
82
+ " <td>Staphylococcus aureus</td>\n",
83
+ " <td>Firmicutes</td>\n",
84
+ " <td>Bacilli</td>\n",
85
+ " <td>1.0</td>\n",
86
+ " <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
87
+ " </tr>\n",
88
+ " <tr>\n",
89
+ " <th>3</th>\n",
90
+ " <td>P45723</td>\n",
91
+ " <td>Extracellular</td>\n",
92
+ " <td>Staphylococcus aureus</td>\n",
93
+ " <td>Firmicutes</td>\n",
94
+ " <td>Bacilli</td>\n",
95
+ " <td>1.0</td>\n",
96
+ " <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
97
+ " </tr>\n",
98
+ " <tr>\n",
99
+ " <th>4</th>\n",
100
+ " <td>P81177</td>\n",
101
+ " <td>Extracellular</td>\n",
102
+ " <td>Staphylococcus aureus</td>\n",
103
+ " <td>Firmicutes</td>\n",
104
+ " <td>Bacilli</td>\n",
105
+ " <td>1.0</td>\n",
106
+ " <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
107
+ " </tr>\n",
108
+ " </tbody>\n",
109
+ "</table>\n",
110
+ "</div>"
111
+ ],
112
+ "text/plain": [
113
+ " SwissProt_ID Experimental_Localization Organism Phylum \\\n",
114
+ "0 P50307 Cytoplasmic Staphylococcus aureus Firmicutes \n",
115
+ "1 P01552 Extracellular Staphylococcus aureus Firmicutes \n",
116
+ "2 P09978 Extracellular Staphylococcus aureus Firmicutes \n",
117
+ "3 P45723 Extracellular Staphylococcus aureus Firmicutes \n",
118
+ "4 P81177 Extracellular Staphylococcus aureus Firmicutes \n",
119
+ "\n",
120
+ " Class GramStain Sequence \n",
121
+ "0 Bacilli 1.0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
122
+ "1 Bacilli 1.0 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
123
+ "2 Bacilli 1.0 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
124
+ "3 Bacilli 1.0 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
125
+ "4 Bacilli 1.0 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
126
+ ]
127
+ },
128
+ "execution_count": 3,
129
+ "metadata": {},
130
+ "output_type": "execute_result"
131
+ }
132
+ ],
133
+ "source": [
134
+ "sequences: pd.DataFrame = pd.read_csv('/home/jpuglia/Documentos/Tesis/tesisESM/Data/trainingData.csv')\n",
135
+ "sequences.head()"
136
+ ]
137
+ },
138
+ {
139
+ "cell_type": "code",
140
+ "execution_count": null,
141
+ "id": "d7026979",
142
+ "metadata": {},
143
+ "outputs": [
144
+ {
145
+ "ename": "ValueError",
146
+ "evalue": "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",
147
+ "output_type": "error",
148
+ "traceback": [
149
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
150
+ "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
151
+ "\u001b[0;32m/tmp/ipykernel_118460/767462261.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0misfloat\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Sequence'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0msequences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misfloat\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
152
+ "\u001b[0;32m~/miniconda3/envs/tesisEnv/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1575\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mfinal\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1576\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__nonzero__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mNoReturn\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1577\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1578\u001b[0m \u001b[0;34mf\"The truth value of a {type(self).__name__} is ambiguous. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1579\u001b[0m \u001b[0;34m\"Use a.empty, a.bool(), a.item(), a.any() or a.all().\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1580\u001b[0m )\n",
153
+ "\u001b[0;31mValueError\u001b[0m: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
154
+ ]
155
+ }
156
+ ],
157
+ "source": [
158
+ "isfloat: pd.Series = sequences['Sequence'].apply(lambda x:isinstance(x,float))\n",
159
+ "\n",
160
+ "sequences = sequences[~isfloat]"
161
+ ]
162
+ },
163
+ {
164
+ "cell_type": "code",
165
+ "execution_count": null,
166
+ "id": "ea723ad9",
167
+ "metadata": {},
168
+ "outputs": [],
169
+ "source": [
170
+ "sequences = sequences.dropna()\n",
171
+ "sequences = sequences.drop_duplicates()\n",
172
+ "sequences.shape"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": null,
178
+ "id": "07a49fd0",
179
+ "metadata": {},
180
+ "outputs": [],
181
+ "source": [
182
+ "torch.cuda.empty_cache()\n",
183
+ "client: ESMC = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": null,
189
+ "id": "294c6798",
190
+ "metadata": {},
191
+ "outputs": [],
192
+ "source": [
193
+ "# Set up the output directory where one .npy embedding file per protein is written.\n",
194
+ "embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
195
+ "os.makedirs(embeddings_dir, exist_ok=True)\n",
196
+ "\n",
197
+ "def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:\n",
198
+ " \n",
199
+ " protein = ESMProtein(sequence=sequence)\n",
200
+ " protein_tensor = client.encode(protein)\n",
201
+ " if isinstance(protein_tensor, ESMProteinError):\n",
202
+ " raise protein_tensor\n",
203
+ " output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))\n",
204
+ " return output\n",
205
+ "\n",
206
+ "\n",
207
+ "def save_emb(dir: str, df : pd.DataFrame) -> None:\n",
208
+ " \n",
209
+ " for i in df.index:\n",
210
+ " \n",
211
+ " output: LogitsOutput = embed_sequence(client = client, sequence = df.loc[i, 'Sequence'])\n",
212
+ " \n",
213
+ " embeddings_np : np.ndarray = output.embeddings.cpu().numpy()\n",
214
+ " \n",
215
+ " file_path : str = os.path.join(dir,f\"{df.loc[i, 'SwissProt_ID']}.npy\") \n",
216
+ "\n",
217
+ " np.save(file_path, embeddings_np)\n",
218
+ " \n",
219
+ " del output\n",
220
+ " \n",
221
+ " torch.cuda.empty_cache()"
222
+ ]
223
+ },
224
+ {
225
+ "cell_type": "code",
226
+ "execution_count": null,
227
+ "id": "80db4990",
228
+ "metadata": {},
229
+ "outputs": [],
230
+ "source": [
231
+ "save_emb(embeddings_dir, sequences)\n"
232
+ ]
233
+ }
234
+ ],
235
+ "metadata": {
236
+ "kernelspec": {
237
+ "display_name": "tesisEnv",
238
+ "language": "python",
239
+ "name": "python3"
240
+ },
241
+ "language_info": {
242
+ "codemirror_mode": {
243
+ "name": "ipython",
244
+ "version": 3
245
+ },
246
+ "file_extension": ".py",
247
+ "mimetype": "text/x-python",
248
+ "name": "python",
249
+ "nbconvert_exporter": "python",
250
+ "pygments_lexer": "ipython3",
251
+ "version": "3.10.16"
252
+ }
253
+ },
254
+ "nbformat": 4,
255
+ "nbformat_minor": 5
256
+ }
notebooks/EmbAnalisis.ipynb ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1e21abaa9bc06181ad40648ad354596985d284daada49adc7d9c0d17daa6bce5
3
+ size 10632399
notebooks/ProstT5.ipynb ADDED
@@ -0,0 +1,526 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "40b1e04a",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pandas as pd\n",
11
+ "from transformers import T5Tokenizer, T5EncoderModel\n",
12
+ "import torch\n",
13
+ "import re\n",
14
+ "from tqdm.notebook import tqdm\n",
15
+ "import os\n",
16
+ "import numpy as np\n",
17
+ "import gc\n",
18
+ "\n",
19
+ "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 2,
25
+ "id": "f4c8ff50",
26
+ "metadata": {},
27
+ "outputs": [
28
+ {
29
+ "data": {
30
+ "text/html": [
31
+ "<div>\n",
32
+ "<style scoped>\n",
33
+ " .dataframe tbody tr th:only-of-type {\n",
34
+ " vertical-align: middle;\n",
35
+ " }\n",
36
+ "\n",
37
+ " .dataframe tbody tr th {\n",
38
+ " vertical-align: top;\n",
39
+ " }\n",
40
+ "\n",
41
+ " .dataframe thead th {\n",
42
+ " text-align: right;\n",
43
+ " }\n",
44
+ "</style>\n",
45
+ "<table border=\"1\" class=\"dataframe\">\n",
46
+ " <thead>\n",
47
+ " <tr style=\"text-align: right;\">\n",
48
+ " <th></th>\n",
49
+ " <th>GramStain</th>\n",
50
+ " <th>Experimental_Localization</th>\n",
51
+ " <th>Phylum</th>\n",
52
+ " <th>Class</th>\n",
53
+ " <th>Organism</th>\n",
54
+ " <th>sequence</th>\n",
55
+ " <th>id</th>\n",
56
+ " </tr>\n",
57
+ " </thead>\n",
58
+ " <tbody>\n",
59
+ " <tr>\n",
60
+ " <th>0</th>\n",
61
+ " <td>Gram positive</td>\n",
62
+ " <td>Cytoplasmic</td>\n",
63
+ " <td>Firmicutes</td>\n",
64
+ " <td>Bacilli</td>\n",
65
+ " <td>Staphylococcus aureus</td>\n",
66
+ " <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
67
+ " <td>P50307</td>\n",
68
+ " </tr>\n",
69
+ " <tr>\n",
70
+ " <th>1</th>\n",
71
+ " <td>Gram positive</td>\n",
72
+ " <td>Extracellular</td>\n",
73
+ " <td>Firmicutes</td>\n",
74
+ " <td>Bacilli</td>\n",
75
+ " <td>Staphylococcus aureus</td>\n",
76
+ " <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
77
+ " <td>P01552</td>\n",
78
+ " </tr>\n",
79
+ " <tr>\n",
80
+ " <th>2</th>\n",
81
+ " <td>Gram positive</td>\n",
82
+ " <td>Extracellular</td>\n",
83
+ " <td>Firmicutes</td>\n",
84
+ " <td>Bacilli</td>\n",
85
+ " <td>Staphylococcus aureus</td>\n",
86
+ " <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
87
+ " <td>P09978</td>\n",
88
+ " </tr>\n",
89
+ " <tr>\n",
90
+ " <th>3</th>\n",
91
+ " <td>Gram positive</td>\n",
92
+ " <td>Extracellular</td>\n",
93
+ " <td>Firmicutes</td>\n",
94
+ " <td>Bacilli</td>\n",
95
+ " <td>Staphylococcus aureus</td>\n",
96
+ " <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
97
+ " <td>P45723</td>\n",
98
+ " </tr>\n",
99
+ " <tr>\n",
100
+ " <th>4</th>\n",
101
+ " <td>Gram positive</td>\n",
102
+ " <td>Extracellular</td>\n",
103
+ " <td>Firmicutes</td>\n",
104
+ " <td>Bacilli</td>\n",
105
+ " <td>Staphylococcus aureus</td>\n",
106
+ " <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
107
+ " <td>P81177</td>\n",
108
+ " </tr>\n",
109
+ " </tbody>\n",
110
+ "</table>\n",
111
+ "</div>"
112
+ ],
113
+ "text/plain": [
114
+ " GramStain Experimental_Localization Phylum Class \\\n",
115
+ "0 Gram positive Cytoplasmic Firmicutes Bacilli \n",
116
+ "1 Gram positive Extracellular Firmicutes Bacilli \n",
117
+ "2 Gram positive Extracellular Firmicutes Bacilli \n",
118
+ "3 Gram positive Extracellular Firmicutes Bacilli \n",
119
+ "4 Gram positive Extracellular Firmicutes Bacilli \n",
120
+ "\n",
121
+ " Organism sequence \\\n",
122
+ "0 Staphylococcus aureus MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
123
+ "1 Staphylococcus aureus MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
124
+ "2 Staphylococcus aureus MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
125
+ "3 Staphylococcus aureus MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
126
+ "4 Staphylococcus aureus MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... \n",
127
+ "\n",
128
+ " id \n",
129
+ "0 P50307 \n",
130
+ "1 P01552 \n",
131
+ "2 P09978 \n",
132
+ "3 P45723 \n",
133
+ "4 P81177 "
134
+ ]
135
+ },
136
+ "execution_count": 2,
137
+ "metadata": {},
138
+ "output_type": "execute_result"
139
+ }
140
+ ],
141
+ "source": [
142
+ "sequences_df = pd.read_csv('../Data/trainingData.csv')\n",
143
+ "sequences_df['id'] = sequences_df['SwissProt_ID'].fillna(sequences_df['Refseq_Accession'].fillna(sequences_df['Other_Accession']))\n",
144
+ "sequences_df = sequences_df.drop(columns=['SwissProt_ID', 'Refseq_Accession', 'Other_Accession'])\n",
145
+ "sequences_df.head()"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 3,
151
+ "id": "6925775b",
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "name": "stdout",
156
+ "output_type": "stream",
157
+ "text": [
158
+ "Secuencias 11691\n",
159
+ "Ids 11691\n"
160
+ ]
161
+ }
162
+ ],
163
+ "source": [
164
+ "sequences = list(sequences_df['sequence'].values)\n",
165
+ "accession = list(sequences_df['id'].values)\n",
166
+ "\n",
167
+ "print(f\"Secuencias {len(sequences)}\\nIds {len(accession)}\")"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": 4,
173
+ "id": "c19ac1ba",
174
+ "metadata": {},
175
+ "outputs": [],
176
+ "source": [
177
+ "path = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/prost/embeddings\")"
178
+ ]
179
+ },
180
+ {
181
+ "cell_type": "code",
182
+ "execution_count": 5,
183
+ "id": "5b5e321e",
184
+ "metadata": {},
185
+ "outputs": [
186
+ {
187
+ "name": "stderr",
188
+ "output_type": "stream",
189
+ "text": [
190
+ "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
191
+ ]
192
+ },
193
+ {
194
+ "data": {
195
+ "application/vnd.jupyter.widget-view+json": {
196
+ "model_id": "17d989ac426c445dbfd209d0247a9a3d",
197
+ "version_major": 2,
198
+ "version_minor": 0
199
+ },
200
+ "text/plain": [
201
+ "Processing Sequences: 0%| | 0/11691 [00:00<?, ?it/s]"
202
+ ]
203
+ },
204
+ "metadata": {},
205
+ "output_type": "display_data"
206
+ },
207
+ {
208
+ "name": "stdout",
209
+ "output_type": "stream",
210
+ "text": [
211
+ "Error CUDA out of memory. Tried to allocate 1.64 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.02 GiB is free. Including non-PyTorch memory, this process has 4.11 GiB memory in use. Of the allocated memory 4.00 GiB is allocated by PyTorch, and 10.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba CAC14227\n",
212
+ "Error CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.09 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.89 GiB is allocated by PyTorch, and 36.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P12255\n",
213
+ "Error CUDA out of memory. Tried to allocate 982.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 748.44 MiB is free. Including non-PyTorch memory, this process has 4.40 GiB memory in use. Of the allocated memory 4.25 GiB is allocated by PyTorch, and 51.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P20471\n",
214
+ "Error CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 711.88 MiB is free. Including non-PyTorch memory, this process has 4.48 GiB memory in use. Of the allocated memory 4.33 GiB is allocated by PyTorch, and 44.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba A64556\n",
215
+ "Error CUDA out of memory. Tried to allocate 1.28 GiB. GPU 0 has a total capacity of 5.59 GiB of which 111.88 MiB is free. Including non-PyTorch memory, this process has 5.07 GiB memory in use. Of the allocated memory 4.90 GiB is allocated by PyTorch, and 67.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba AAF25576\n",
216
+ "Error CUDA out of memory. Tried to allocate 1.55 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.91 GiB is allocated by PyTorch, and 19.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q4L9P0\n",
217
+ "Error CUDA out of memory. Tried to allocate 1.04 GiB. GPU 0 has a total capacity of 5.59 GiB of which 591.88 MiB is free. Including non-PyTorch memory, this process has 4.60 GiB memory in use. Of the allocated memory 4.45 GiB is allocated by PyTorch, and 40.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I5N6\n",
218
+ "Error CUDA out of memory. Tried to allocate 1.49 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.22 GiB is free. Including non-PyTorch memory, this process has 3.95 GiB memory in use. Of the allocated memory 3.84 GiB is allocated by PyTorch, and 5.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I791\n",
219
+ "Error CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 31.88 MiB is free. Including non-PyTorch memory, this process has 5.14 GiB memory in use. Of the allocated memory 5.01 GiB is allocated by PyTorch, and 36.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I120\n"
220
+ ]
221
+ }
222
+ ],
223
+ "source": [
224
+ "# Setup device\n",
225
+ "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
226
+ "\n",
227
+ "# Load tokenizer and model\n",
228
+ "tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
229
+ "model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
230
+ "model.float() if device.type == 'cpu' else model.half()\n",
231
+ "\n",
232
+ "# Clean sequences\n",
233
+ "sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
234
+ "sequences = [ \"<AA2fold> \" + s for s in sequences]\n",
235
+ "\n",
236
+ "# Process each sequence individually\n",
237
+ "for i, (seq, acc_id) in enumerate(tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\")):\n",
238
+ " try:\n",
239
+ " # Tokenize\n",
240
+ " ids = tokenizer(\n",
241
+ " seq,\n",
242
+ " add_special_tokens=True,\n",
243
+ " return_tensors='pt'\n",
244
+ " ).to(device)\n",
245
+ "\n",
246
+ " # Forward pass\n",
247
+ " with torch.no_grad():\n",
248
+ " embedding_repr = model(\n",
249
+ " ids.input_ids,\n",
250
+ " attention_mask=ids.attention_mask\n",
251
+ " )\n",
252
+ "\n",
253
+ " # Exclusive slice end: attention-mask sum minus one, i.e. the index of the last attended (trailing special) token\n",
254
+ " real_len = ids.attention_mask[0].sum().item() - 1\n",
255
+ "\n",
256
+ " # Skip the <AA2fold> prefix at index 0, keep positions 1..real_len-1, and mean-pool them into one vector\n",
257
+ " emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
258
+ " emb_avg = emb.mean(dim=0).cpu().numpy()\n",
259
+ "\n",
260
+ " # Save embedding using accession ID\n",
261
+ " np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
262
+ "\n",
263
+ "\n",
264
+ " # Cleanup\n",
265
+ " del ids, embedding_repr, emb, emb_avg\n",
266
+ " torch.cuda.empty_cache()\n",
267
+ " gc.collect()\n",
268
+ "\n",
269
+ " except RuntimeError as e:\n",
270
+ " print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
271
+ "\n"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 6,
277
+ "id": "9ca9cb2d",
278
+ "metadata": {},
279
+ "outputs": [
280
+ {
281
+ "data": {
282
+ "text/html": [
283
+ "<div>\n",
284
+ "<style scoped>\n",
285
+ " .dataframe tbody tr th:only-of-type {\n",
286
+ " vertical-align: middle;\n",
287
+ " }\n",
288
+ "\n",
289
+ " .dataframe tbody tr th {\n",
290
+ " vertical-align: top;\n",
291
+ " }\n",
292
+ "\n",
293
+ " .dataframe thead th {\n",
294
+ " text-align: right;\n",
295
+ " }\n",
296
+ "</style>\n",
297
+ "<table border=\"1\" class=\"dataframe\">\n",
298
+ " <thead>\n",
299
+ " <tr style=\"text-align: right;\">\n",
300
+ " <th></th>\n",
301
+ " <th>GramStain</th>\n",
302
+ " <th>Experimental_Localization</th>\n",
303
+ " <th>Phylum</th>\n",
304
+ " <th>Class</th>\n",
305
+ " <th>Organism</th>\n",
306
+ " <th>sequence</th>\n",
307
+ " <th>id</th>\n",
308
+ " </tr>\n",
309
+ " </thead>\n",
310
+ " <tbody>\n",
311
+ " <tr>\n",
312
+ " <th>1532</th>\n",
313
+ " <td>Gram negative</td>\n",
314
+ " <td>OuterMembrane,Extracellular</td>\n",
315
+ " <td>Proteobacteria</td>\n",
316
+ " <td>Gammaproteobacteria</td>\n",
317
+ " <td>Yersinia pestis</td>\n",
318
+ " <td>MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ...</td>\n",
319
+ " <td>CAC14227</td>\n",
320
+ " </tr>\n",
321
+ " <tr>\n",
322
+ " <th>1683</th>\n",
323
+ " <td>Gram negative</td>\n",
324
+ " <td>OuterMembrane</td>\n",
325
+ " <td>Proteobacteria</td>\n",
326
+ " <td>Betaproteobacteria</td>\n",
327
+ " <td>Bordetella pertussis</td>\n",
328
+ " <td>MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS...</td>\n",
329
+ " <td>P12255</td>\n",
330
+ " </tr>\n",
331
+ " <tr>\n",
332
+ " <th>1767</th>\n",
333
+ " <td>Gram negative</td>\n",
334
+ " <td>CytoplasmicMembrane</td>\n",
335
+ " <td>Proteobacteria</td>\n",
336
+ " <td>Alphaproteobacteria</td>\n",
337
+ " <td>Sinorhizobium meliloti</td>\n",
338
+ " <td>MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG...</td>\n",
339
+ " <td>P20471</td>\n",
340
+ " </tr>\n",
341
+ " <tr>\n",
342
+ " <th>4089</th>\n",
343
+ " <td>Gram negative</td>\n",
344
+ " <td>OuterMembrane,Extracellular</td>\n",
345
+ " <td>Proteobacteria</td>\n",
346
+ " <td>Epsilonproteobacteria</td>\n",
347
+ " <td>Helicobacter pylori</td>\n",
348
+ " <td>MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW...</td>\n",
349
+ " <td>A64556</td>\n",
350
+ " </tr>\n",
351
+ " <tr>\n",
352
+ " <th>4623</th>\n",
353
+ " <td>Gram positive</td>\n",
354
+ " <td>Cellwall</td>\n",
355
+ " <td>Firmicutes</td>\n",
356
+ " <td>Bacilli</td>\n",
357
+ " <td>Lactobacillus reuteri</td>\n",
358
+ " <td>MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT...</td>\n",
359
+ " <td>AAF25576</td>\n",
360
+ " </tr>\n",
361
+ " </tbody>\n",
362
+ "</table>\n",
363
+ "</div>"
364
+ ],
365
+ "text/plain": [
366
+ " GramStain Experimental_Localization Phylum \\\n",
367
+ "1532 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
368
+ "1683 Gram negative OuterMembrane Proteobacteria \n",
369
+ "1767 Gram negative CytoplasmicMembrane Proteobacteria \n",
370
+ "4089 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
371
+ "4623 Gram positive Cellwall Firmicutes \n",
372
+ "\n",
373
+ " Class Organism \\\n",
374
+ "1532 Gammaproteobacteria Yersinia pestis \n",
375
+ "1683 Betaproteobacteria Bordetella pertussis \n",
376
+ "1767 Alphaproteobacteria Sinorhizobium meliloti \n",
377
+ "4089 Epsilonproteobacteria Helicobacter pylori \n",
378
+ "4623 Bacilli Lactobacillus reuteri \n",
379
+ "\n",
380
+ " sequence id \n",
381
+ "1532 MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ... CAC14227 \n",
382
+ "1683 MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS... P12255 \n",
383
+ "1767 MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG... P20471 \n",
384
+ "4089 MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW... A64556 \n",
385
+ "4623 MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT... AAF25576 "
386
+ ]
387
+ },
388
+ "execution_count": 6,
389
+ "metadata": {},
390
+ "output_type": "execute_result"
391
+ }
392
+ ],
393
+ "source": [
394
+ "cpu_ids = [\n",
395
+ " 'CAC14227',\n",
396
+ " 'P12255',\n",
397
+ " 'P20471',\n",
398
+ " 'A64556',\n",
399
+ " 'AAF25576',\n",
400
+ " 'Q4L9P0',\n",
401
+ " 'Q9I5N6',\n",
402
+ " 'Q9I791',\n",
403
+ " 'Q9I120'\n",
404
+ "]\n",
405
+ "\n",
406
+ "cpu_sequences = sequences_df[sequences_df['id'].isin(cpu_ids)]\n",
407
+ "cpu_sequences.head()\n"
408
+ ]
409
+ },
410
+ {
411
+ "cell_type": "code",
412
+ "execution_count": 7,
413
+ "id": "a919beeb",
414
+ "metadata": {},
415
+ "outputs": [
416
+ {
417
+ "name": "stdout",
418
+ "output_type": "stream",
419
+ "text": [
420
+ "['MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQSMVSKLFRKNLLALSLGSIVFLSTGPVFAADITVSTQAELSAALSNGTYDKIILGADITLIGSLTVNMTSNQVVIDGQGKFGLTVNNTTNYGLVVSSGSGTLTLQNMSKIDSANYYSMVVLNGANTAVNVIYNNIDFLGSSQLIYMGAYGAATNSIMTFGDILNDVVVNDRAQEIGEVNKLAFTGRFHVTHTGSSVTSFVSTGGANNTSTMDFASGADVKIDRTGSTGDLTSTGVNAFAYTFADGASFELIANQNVFSGTTTNRGLEIGSYNSIDGFGSGVKIVLQSRSDGSIISGNGIDNATTNAAGINNNASGDANVIYNLGTGSILKATNTGILATKNANNASDIYIRSAGDITAATGISATHNGTGTVKIKNDGTITSTTAGIAISSASIKEISVDNTDGTITATAGTGVNVLASAILNLFGGTINTSATANGITFAGTEGGHTLTDLTINLLGTGIALSNVAGVNLTLSNVTLNTLNGTALNSLTGLTLVDSLNGRNTINIEGAGIGIAATNTELNTFDAEALDINVNGAGIGIQATGGGVNLSASNLIINVANTLGTALQITDGIDNTTTIGNEIQLNAENATAINFLGSSSKTLNNNGTIKGSVIFAGVADHIINNNGTLDGTLTTGAGNDTLVLDSSSQSNDVINLGDGNNSVTIQNGATVSSIITGNGNDTFTINGMSVGSTYLGSLDAGTGLNTXNXXASTDELAAATSLQGFTNINLVDSHITLVSDDNIGSGMVNIDSSSELLFGSTFDGILHATLGAGTGSAIVNNSANVSLEQASMFAGTWQVNQGGALTASNSNQLGSAKIGLDGTLNLDNIALFNHVLTGNGTLNVAKNLATTAFDFGSTVGGAFSGIVNLTKTTFALSADNAAALASATLKLSDDSVTTVGTTDRTLHGLDLSGGTLIFDGAVPQSQTSGVVTVTDLALNSGTVNITGSGSWDNTDPLATNVSILEQDRAGSTLELINATNVTGDIDALDLLVNGTAITSGTQGVQSAIQQGGSTVANAIHNYGLASSNSNGDSGLYVNYTLSALELLADGADALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGSNRYEGTTTVTAGELILGANGAFGQTSLLDIASGASANINGYSQTVGAVTNVGTVTLGSGGVLTSGLLTNGGILDLTGGALNLTXGGASTVAGGLTGAGTLNINGGNLSVSAANSGLSGQTHIADVASVTLTDTGTLGTSAVEVLGTLNLNGANAAMTNVLSGDGTINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHVLTGNGLLNVAKNDASTAFDFGSTVGGAFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGATDRTLHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLELINAANVTGNANNLDLLVDGTAITSGTQGVESAIQQGGSTVANAIHNYGLTSSNGNGGSGLYVNYTLSALELLANGANALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNSGAVTLGNGGVLTSGLLTNGGILDLTGGALNLAAGGSSTVAGGLTGAGTLNINGGDLAVSATNSGLSGQTHIADVASVTLTGTGTLGTSAVEVLGTLNLNGANAAMTNVLSGGGVINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANN
SGFLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNXTFGNSVTGSGVLQVTDDAEVTLTSSNXVGNTVKVDIADATLYVNDIALLDHVLTENGTLNVAKYLATTAFDXGSTVGXXFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGTTDRILHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLQLIDADNVTGNANDLELMINGTTISAGQGVQSTVQQGGYTVANATHNYGMTSNGGSGLYVNYTLSALELLADGANALLLATESGLTANRELNAELSGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNTGTVTLGNGGELTSTDTLINTGMINVTDGILNLENGGASSISGGLTGNGILNIKGGDFTISIDNNGLAGQTNISDGASVTLGNGGTIIGTGNLGSSVIDVLGDLNLVADNSLANVISGDGTINTTATVTLSGNSSFSGAHQIGTNGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGTGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHALTGNGLLNVAKNDASTAFDFGATVGGAFTGTVNLNNSTFDLSGNNTTVLAQATLKLSSGNLTSVGNGVQNIGTLAMNGGTLLFDNIVDNAGIITSDGTIAANSINTTGGGEVRVNLPSNLAPSLDGLSVMELDEGEIIVTLATGAATGTGHELTLTDENGDPISAVTYQGVHNAGSTSAAATGSFNYGMTTGEDYDGLYVNYGLTALELLSTGSEALVLTAILANNGTQSNDLSAQITGSGDLAFASANDGSTASLSNSTNSYTGTTWVSSGNLRLDADSALGQTSLLAMSTATHVDINGTQQVVGELATEGGSTLDLNDGKLTVTGGGQIDGALTGGGELVLSGGLLNVSYDNAGFTGSTDIANGAVAHLSQAQGLGNGTINNNGTLHLDNTIGTLFNALTGSDGEVLLSNNASVQLAGDNSGYSGLFTNQAGSILIANSAEHLGGSSIANSGALILDTGSVWELTNTISGTGTLVKRGSGTVKIEGDTVSAGLTTIEEGLLQLGSSAVTQTLSLEESLQERALLVSFASNMANLTSNVLITANGSLGGYGQVTGNVENYGNLIMPNALTGGDFGTFTIDGNYTGDEGMITFNTILAGDTSVTDRLVITGDTAGQSYVTVNNIGGVGARTFEGIKIIDVGGDSAGQFTLNGRAVGGAYEYFLYQGGASTPDDGNWYLRTEADDRRPEPASYTANLAAANNMFVTSLADRMGETLYTDVFTGEQKTTSLWLRNEGSHNRSRDDSGELKTQDNRYVMQLGGDVAQWSRNAQDLWRVGVMAGYANSSSSTVAQVAGYRSTGSVDGYSVGIYGSWLADNADDTGAYVDSWVQYSWFDNRVSGQDLATEKYDSKGFTASVEGGYAFKVGESVNQSYFIQPKAQVVWMGVKADDHTETNGTVISGDGNGNIQTRLGAKAFINPSDKAKVSGPAFKPFVEANWIHNTKDFGTTLDGVTVKQAGTANIAELKLGVDGQVNSQLNLWGNIGQQVGNKGYSETSVVLGVKYNF', 
'MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATSLSVAPNALAWALMLACTGLPLVTHAQGLVPQGQTQVLQGGNKVPVVNIADPNSGGVSHNKFQQFNVANPGVVFNNGLTDGVSRIGGALTKNPNLTRQASAILAEVTDTSPSRLAGTLEVYGKGADLIIANPNGISVNGLSTLNASNLTLTTGRPSVNGGRIGLDVQQGTVTIERGGVNATGLGYFDVVARLVKLQGAVSSKQGKPLADIAVVAGANRYDHATRRATPIAAGARGAAAGAYAIDGTAAGAMYGKHITLVSSDSGLGVRQLGSLSSPSAITVSSQGEIALGDATVQRGPLSLKGAGVVSAGKLASGGGAVNVAGGGAVKIASASSVGNLAVQGGGKVQATLLNAGGTLLVSGRQAVQLGAASSRQALSVNAGGALKADKLSATRRVDVDGKQAVALGSASSNALSVRAGGALKAGKLSATGRLDVDGKQAVTLGSVASDGALSVSAGGNLRAKQLVSSAQLEVRGQREVALDDASSARGMTVVAAGALAARNLQSKGAIGVQGGEAVSVANANSDAELRVRGRGQVDLHDLSAARGADISGEGRVNIGRARSDSDVKVSAHGALSIDSMTALGAIGVQAGGSVSAKDMRSRGAVTVSGGGAVNLGDVQSDGQVRATSAGAMTVRDVAAAADLALQAGDALQAGFLKSAGAMTVNGRDAVRLDGAHAGGQLRVSSDGQAALGSLAAKGELTVSAARAATVAELKSLDNISVTGGERVSVQSVNSASRVAISAHGALDVGKVSAKSGIGLEGWGAVGADSLGSDGAISVSGRDAVRVDQARSLADISLGAEGGATLGAVEAAGSIDVRGGSTVAANSLHANRDVRVSGKDAVRVTAATSGGGLHVSSGRQLDLGAVQARGALALDGGAGVALQSAKASGTLHVQGGEHLDLGTLAAVGAVDVNGTGDVRVAKLVSDAGADLQAGRSMTLGIVDTTGDLQARAQQKLELGSVKSDGGLQAAAGGALSLAAAEVAGALELSGQGVTVDRASASRARIDSTGSVGIGALKAGAVEAASPRRARRALRQDFFTPGSVVVRAQGNVTVGRGDPHQGVLAQGDIIMDAKGGTLLLRNDALTENGTVTISADSAVLEHSTIESKISQSVLAAKGDKGKPAVSVKVAKKLFLNGTLRAVNDNNETMSGRQIDVVDGRPQITDAVTGEARKDESVVSDAALVADGGPIVVEAGELVSHAGGIGNGRNKENGASVTVRTTGNLVNKGYISAGKQGVLEVGGALTNEFLVGSDGTQRIEAQRIENRGTFQSQAPAGTAGALVVKAAEAIVHDGVMATKGEMQIAGKGGGSPTVTAGAKATTSANKLSVDVASWDNAGSLDIKKGGAQVTVAGRYAEHGEVSIQGDYTVSADAIALAAQVTQRGGAANLTSRHDTRFSNKIRLMGPLQVNAGGAVSNTGNLKVREGVTVTAASFDNETGAEVMAKSATLTTSGAARNAGKMQVKEAATIVAASVSNPGTFTAGKDITVTSRGGFDNEGKMESNKDIVIKTEQFSNGRVLDAKHDLTVTASGQADNRGSLKAGHDFTVQAQRIDNSGTMAAGHDATLKAPHLRNTGQVVAGHDIHIINSAKLENTGRVDARNDIALDVADFTNTGSLYAEHDATLTLAQGTQRDLVVDQDHILPVAEGTLRVKAKSLTTEIETGNPGSLIAEVQENIDNKQAIVVGKDLTLSSAHGNVANEANALLWAAGELTVKAQNITNKRAALIEAGGNARLTAAVALLNKLGRIRAGEDMHLDAPRIENTAKLSGEVQRKGVQDVGGGEHGRWSGIGYVNYWLRAGNGKKAGTIAAPWYGGDLTAEQSLIEVGKDLYLNAGARKDEHRHLLNEGVIQAGGHGHIGGDVDNRSVVRTVSAMEYFKTPLPVSLTALDNRAGLSPATWNFQSTYELLDYLLDQNRYEYIWGLYPTYTEWSVNTLKNLDLGYQAKPAPTAPPMPKAPELD
LRGHTLESAEGRKIFGEYKKLQGEYEKAKMAVQAVEAYGEATRRVHDQLGQRYGKALGGMDAETKEVDGIIQEFAADLRTVYAKQADQATIDAETDKVAQRYKSQIDAVRLQAIQPGRVTLAKALSAALGADWRALGHSQLMQRWKDFKAGKRGAEIAFYPKEQTVLAAGAGLTLSNGAIHNGENAAQNRGRPEGLKIGAHSATSVSGSFDALRDVGLEKRLDIDDALAAVLVNPHIFTRIGAAQTSLADGAAGPALARQARQAPETDGMVDARGLGSADALASLASLDAAQGLEVSGRRNAQVADAGLAGPSAVAAPAVGAADVGVEPVTGDQVDQPVVAVGLEQPVATVRVAPPAVALPRPLFETRIKFIDQSKFYGSRYFFEQIGYKPDRAARVAGDNYFDTTLVREQVRRALGGYESRLPVRGVALVAKLMDSAGTVGKALGLKVGVAPTAQQLKQADRDFVWYVDTVIDGQKVLAPRLYLTEATRQGITDQYAGGGALIASGGDVTVNTDGHDVSSVNGLIQGRSVKVDAGKGKVVVADSKGAGGGIEADDEVDVSGRDIGIEGGKLRGKDVRLKADTVKVATSMRYDDKGRLAARGDGALDAQGGQLHIEAKRLETAGATLKGGKVKLDVDDVKLGGVYEAGSSYENKSSTPLGSLFAILSSTTETNQSAHANHYGTRIEAGTLEGKMQNLEIEGGSVDAAHTDLSVARDARFKAAADFAHAEHEKDVRQLSLGAKVGAGGYEAGFSLGSESGLEAHAGRGMTAGAEVKVGYRASHEQSSETEKSYRNANLNFGGGSVEAGNVLDIGGADINRNRYGGAAKGNAGTEEALRMRAKKVESTKYVSEQTSQSSGWSVEVASTASARSSLLTAATRLGDSVAQNVEDGREIRGELMAAQVAAEATQLVTADTAAVALSAGISADFDSSHSRSTSQNTQYLGGNLSIEATEGDATLVGAKFGGGDQVSLKAAKSVNLMAAESTFESYSESHNFHASADANLGANAVQGAVGLGLTAGMGTSHQITNETGKTYAGTSVDAANVSIDAGKDLNLSGSRVRGKHVVLDVEGDINATSKQDERNYNSSGGGWDASAGVAIQNRTLVAPVGSAGFNFNTEHDNSRLTNDGAAGVVASDGLTGHVKGDANLTGATIADLSGKGNLKVDGAVNAQNLKDYRDKDGGSGGLNVGISSTTLAPTVGVAFGRVAGEDYQAEQRATIDVGQTKDPARLQVGGGVKGTLNQDAAQATVVQRNKHWAGGGSEFSVAGKSLKKKNQVRPVETPTPDVVDGPPSRPTTPPASPQPIRATVEVSSPPPVSVATVEVVPRPKVETAQPLPPRPVAAQVVPVTPPKVEVAKVEVVPRPKVETAQPLPPRPVVAEKVTTPAVQPQLAKVETVQPVKPETTKPLPKPLPVAKVTKAPPPVVETAQPLPPVKPQKATPGPVAEVGKATVTTVQVQSAPPKPAPVAKQPAPAPKPKPKPKPKAERPKPGKTTPLSGRHVVQQQVQVLQRQASDINNTKSLPGGKLPKPVTVKLTDENGKPQTYTINRREDLMKLNGKVLSTKTTLGLEQTFRLRVEDIGGKNYRVFYETNK', 
'MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKGTSALPGFFPFEFRARHRENEKEILRVYRATAADVEAGASITPAAEWLLDNHHVVEEAIQEVRRDFPRRFYRQLPTLSVSGTVIPRTMALAWLYVAHTHSTVTRESITAMVEGFQEHETLKIGELWALPSILRFVLIENLRRIAIRVERSRGMRRKANEVADQLIRLNDPEGCRTLLVESEALAADNTFIAQLLYRMRDGSQSSGAVIAWIEERLERRGTDVEEALVAEQNRLSSGNATMSNIIRSLREIDDTDWAVWFESVSKIDATLREGSDYAALDFGSRNTYRDTIEKLARRSGHSEHEVTEIAIEMVEEAKAAAAVEAPLQEPNVGSFLVGKQRLALEKRIGYSPSIFQHLIRSVRKLDWFAIAGPNILLTILAMIVVYAFVSPMDIPSGAKLIMLLLFALPASEGAMGLFNTVFTLFAKPSRLVGYEFLDGIPEDARTLVVVPCLIAKRDHVDELVRNLEVHYLANPRGEIYFALLSDWADSKSEEAPADTDVLEYAKREIASLSARYAYDGKTRFFLLHRRRLYNEAEGVWMGWERKRGKLHELNLLLRGDRDTSFLQGANMVPEGVQYVMTLDSDTRLMRDAVTKLVGKLYHPINRPVVNPRTQEVVTGYSLLQPRVTPSLTTGSEASAFQRIFTINRGIDPYVFTVSDVYQDIAGEGSFTGKGLYHVDAFEAALKSRIEENAVLSHDLLEGSYARCALVTDIELVEDFPIRYEVEMSRQHRWARGDWQLLPYIFNPKNGLSMLGRWKMYDNLRRSLIPVAWLAASVMGWYYMEPTPALIWQLVLIFSLFVAPTLSLISGIMPRRNDIVARAHLHTVLSDIRAANAQVALRIVFIAHNAAMMADAIVRSLYRTFVSRKLMLEWRTAAQVQSAGHGSIGDYFRAMWTAPALALVSLALAAISDTGLPFIGLPFALIWAASPAVAWFVSQSAETEDQLVVSEEAIEEMRKIARRTWRYFEAFVTAEQNFLPPDNFQETPQPVLAERTSPTNIGVYLLSVMSARSFGWIGFEETITRLEQTIATIDRMPKYRGHLFNWYRTRGLEPMEPRYVSSVDSGNLAGHLIAVSSMCREWAEAPSAHVQGNLDGIGDVAAILKEALNELPDDRKTVRPLRRLVEERIAGFQNALAAVKRERELASIRVINLAVLARDMHKLTVNLDHEVRTVQSGEVATWAGSLVAACEAHIADGVFDLGAIEALRQRLLVLKERARDIAFSMDFSFLFRPERRLLSIGYRVNANELDEACYDLLASEARLTSLFAIAKGDLPTEHWYKLGRPIVPIGARGALVSWSGSMFEYLMPPLVMQERQGGILNQTNNLVVQEQINHGRRLGTPWGISEAAFNARDHELTYQYTNFGVPTLGLKRGLGQNAVIAPYASILACMYDPKSALANLARLREVGALGAYGYHDAVDFTPTRVPEGQKCAVVRNYYAHHHGMSVAAVANVVFNGQLREWFHADPVIEAAELLLQEKAPRDIPVMAAKREPEALGKGQADLLRPEVRVVEDPINQDRETVLLSNGHYSVMLTATGAGYARWNGQSVTRWTPDPVEDRTGTFIFLRDTVTGDWWSATAEPRRAPGEKTVTRFGDDKAEFVKTVGDLTSEVECIVATEHDAEGRRVILLNTGTEDRFIEVTSYAEPVLAMDDADSSHPTFSKMFLRTEISRHGDVIWVSRNKRSPGDPDIEVAHLVTDNAGSERHTQAETDRRRFLGQGRTLAEAAAFDPGATLSGTDGFTLDPIVSLRRVVRVPAGKKVSVIFWTIAAPDREGVDRAIDRYRHPETFNHELIHAWTRSQVQMRHVGITSKEAASFQMLGRYLVYPDMHLRADAETVKTGLASQSALWPLAISGDFPIFCLRINDDGDLGIAREALRAQEYLRARGITADLVVVNERASSYAQDLQHTLDSMCENLRLRGLSDGPRQHIFAVRRDLMEPETWSTLISASRAVFHARNG
TISDQIARATSLYSKSSEKKEEGAEMLLPVIREADARTAVELDGGDLDFWNGFGGFAEDGREYAVRLRGGEATPQPWINVISNEQFGFHVSAEGAAFSWSRNSRDYQLTPWTNDAVVNRPGEAIFVRDMASGAVLTPYAALSRRKSALFETRHGLGYSRFLSTQDELEIEAMHTVHRTLPAKLVRLTIRNRSSAARKLRVYGYAEWVLGNNRSRTAPFVLSEWDESAKTLVATNPYSIDYPGRCAFFASDGDIAGYTASRREFLGRAGGILAPQAVISGAELTGSTDVDGDACAALATDITVEAGVERQVTFFLGDADNPDQVRAVLEELRADSFGAALEAAKAFWGDFTGVVKVETPDRAFNHMINHWLPYQALGCRIMARSAFYQASGAFGFRDQLQDTLAFLIHRPALARAQILNAAARQFVEGDVQHWWLPGTDAGVRTMISDDVVWLAHAVAHYCAVTGEEDILKEKVPFITGPALEEGQHDSFYKPDVADEVGDVYEHCARALDLAIHRTGANGLPLILGGDWNDGMNRVGEAGEGTSVWLGWFLAGTLRAFLPYARARKDKPRVALWERHLEALKDALEQAGWDGDYYRRGYYDDDTPLGSAENGECRIDSIAQSWSTLSGEGDKERSLRAMDAVMAELVDPEKRIVRLFTPPLETTKQDPGYIKAYPPGVRENGGQYTHAATWVVLAFAAQERAEEAWRTFRMLNPVSHALSQVDAEHYRVEPYVVAADIYGEGALAGRGGWTWYTGSAGWLYRAGVEGILGIRKRGDKLLIRPVLPSEWPGYSAEVRVNGTTHRISVSRDSKSGEPVVSVNNSVTKNAHEGVLL', 'MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLWDLLNPKVGGEYVHWVKGSQYCAWWEFAGCLKNVWGANHKGYDAGNAANYLSSQNYQAISVGSGNETGTYSLSGFTNYVGGNLTINLGNSVVLDLSGSNSFTSYQGYNQGKDDVTFTVGAINLNGTLEVGNRVGSGAGTHTGTATLNLNANKVNINSNINAYKTSQVNIGNANSVITIGSVSLSGDVCSSLASVGIGANCSTSGPSYSFKGTTNATNTAFSNASGSFTFEENATFSGAKWNGGTYTFNKEFSATNNTAFSSGSFNFKGVSSFNGTSFSNASYTFDNQATFQNSSFNGGTFTFNNQTNPTNNAQHPQIQNSSFSGNATTLKGFVNFQQAFNNSNHQLTIQNASFNNATFNNTGKITIEKDASFNNTTFNTSVDTNNMSVTGGVTLSGKNDLKNGSTLDFGSSKITLAQGTTFNLTSLGSEKSVTILNSSGGITYSNLLNHAINGLTSALKTNESLSNPQSFAQGLWDIITYNGVTGQLLNENAATSKPTDSSPSKSSTNSTQVYQVGYKIGDTIYKLQETFSHNSIIIQALESGTYTPPPVINGSKFDLSASNYINADMPWYDHKYYIPKSQNFTESGTYYLPSVQIWGSYTNSFKQTFSANGSNLVIGYNSTWTDHNVSSSGTVSFGDTSGSALNGHCGPWPYYQCTGTTNGTYSAYHVYITANLRSGNRIGTGGAANLIFNGVDSINIANATITQHNAGIYSSSMTFSTQSMDNSQNLNGLNSNGKLSVYGTTFTNEAKDGKFIFNAGQAVFENTNFNGGSYQFSGDSLNFSNNNQFNSGSFEISAKNASFNNANFNNSASFNFNNSNATTSFVGDFTNANSNLQIAGNAVFGNSTNGSQNTANFNNTGSVNISGNATFDNVVFNGPTNTSVKGQVTLNNITLKNLNAPLSFGDGTITFNAHSVINIAESITNGNPITLVSSSKEIEYNNAFSKNLWQLINYQGHGASSEKLVSSAGNGVYDVVYSFNNQTYNFQEVFSQNSISIRRLGVNMVFDYVDMEKSDHLYYQNALGFMTYMPNSYNNNLGNANNTIYYYDKSIDFYASGKTLFTKAEFSQTFTGQNSAIVFGAKSIWTSLSDAPQSNTIIRFGDNKGAGSNDASGHCWNLQ
CIGFITGHYEAQKIYITGSIESGNRISSGGGASLNFNGLQGILLTNATLYNRAAGTQSSSMNFISNSANIQAQNSYFIDDTAQNGGNPNFSFNALNLDFSNSSFRGYVGKTQSVFKFNAKNAISFTNSTNLSSGLYQMQAKSVLFDNSNLSVSVGTSSIKANAINLSQNASINASNHSTLELQGDLNVNDTSSLNLNQSTINVSNNATINDYASLIASNGSHLNFNGAVNFNSANITTSLNNSSIVFKGAVSLGGQFNLSNNSSLDFQGSSAITSNTAFNFYDNAFSQSPITFHQALDIKAPLSLGGNLLNPNNSSVLDLKNSQLVFGDQGSLNIANIDLLSDLNDNKNRVYNIIQADMNSNWYERISFFGMHINDGIYDAKNQTYSFTNPLNNALKITESFKDNQLSVTLSQIPGIKNTLYNIGSEIFNYQKVYNNANGVYSYSDDAQGVFYLTSNVKGYYNPNQSYQASGSNNTTKNNNLTSESSIISQTYNAQGNPISALHIYNKGYNFNNIKALGQMALKLYPEIKKVLGNDFSPSSLNALNSNALNQLTKLITPNDWKNINELIDNANNSVVQNFNNGTLIVGATQIGQTDTNSAVVFGGLGYQTPCDYTDIVCQKFRGTYLGQLLESSSADLGYIDTTFNAKEIYLTGTLGSGNAWGTGGSASVTFNSQTSLILNQANIVSSQTDGIFSMLGQEGINKVFNQAGLANILGEVAVQSINKAGGLGNLIVNTLGSNSVIGGYLTPEQKNQTLSQLLGQNNFDNLMNDSGLNTAIKDLIRQKLGFWTGLVGGLAGLGGIDLQNPEKLIGSMSINDLLSKKGLFNQITGFISANDIGQVISVMLQDIVKPSNALKNDVAALGKQMIGEFLGQDTLNSLESLLQNQQIKSVLDKVLAAKGLGPIYEQGLGDLIPNLGKKGLFAPYGLSQVWQKGDFSFNAQGNVFVQNSTFSNANGGTLSFNAGNSLIFAGNNHIAFTNHAGTLQLLSDQVSNINITTLNASNGLKINAANNNVSVSQGNLFVSASCAQQSDPTTANIANPCALSAQSTNGASSNNASNNAPIALSNNDESLMVAANDFNFSGNIYANGVVDFSKIKGSANIKNLYLYNNAQFQANNLTISNQAVLEKNASFVTNNLNIQGAFNNNATQKIEVLQNLVIASNASLSTGIYGLEVGGALNNSGAIHFNLENTQTPTPLIQAEGIINLNTTQTPFMNVNNSMANNTTYTLLKSSRYIDYNINPNSLQSYLNLYTLININGNHIEEKNGALTYLGQRVLLQDKGLLLSVALPNSNNASQNNILSLSVLYNQVKMSCGDKAMDFTPPTLQDYIVGIQGQSALNQIEAVGGNAIKWLSTLMMETKENPFFAPIYLKNHSLNEILGVTKDLQNTASLISNPNFRDNATNLLELASYTQQTSRLTKLSDFRSREGESDFSLLELKNKRFSDPNPEVFVKYSQLSKHPNNLWVQGVGGASFISGGNGTLYGLNAGYDRLVKNVILGGYVAYGYSDFNGNIMHSLGNNVDVGMYARAFLKRNEFTLSANETYGGNATSINSSNSLLSVLNQRYNYNTWTTSVNGNYGYDFMFKQKSVVLKPQVGLSYHFIGLSGMKGNDAAYKQFLMHSNPSNESVLTLNMGLESRKYFGKNSYYFVTARLGRDLLIKSKGSNTVRFVGENTLLYRKGEVFNTFASVITGGEMHLWRLVYVNAGVGLKMGLQYQDINITGNVGMRVAF', 
'MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGATAQAATTESNASAKTEQVVQQNSTSAASDSTSTSNSSAAVSTSSATPVSTESASSMTVSDLPASASAASDNQASAANASESSSQSASSSVASDAAATVSKDSQAASEANSQSAADVETVQLPTSAANANANESQAANILGAQAVQKAANQQAPAGFTVTDPNYPAEMYKDPDASHYTYWWAQSSNGEYNLVLSTDRNGDGKVYVFLLGNNNNVLGKYTVDKNKSTEVATDDEGDFGTVYNDGQSGVFVTSDGTWKSKFNVFDPKAGEDDGDYGSISFMIPQVETQTTTYVTYFDSKGNKVDKPIEVSDPVIQKGLDGQIYTTKGGKVINGYFAKEPKNAHGFMSPFGKQGAIYTKDWHDGLKATFTETDTKTGLMHVVVKHYYHSWGWGTWRTVKEFDLAPGQSEKVDYDVYKSVTIHSIYIPQTINIQYTYEKLGNLVISSDSKSFPAEDKTQYPNDKSDSTKAGNVTIPKVAGFTPTINDKTVTNYTFNPSDYVSDLSKDINVVYVADTQEAAISFYDETDHKPLNDQTIQLTGKTGEKISHTEANQTLAKLGKQGYVVDQNTFADDATYDNDTQAPQEFTIYLKHDTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLADLVKQGYVLDKENTAKAFPADAVYDNNDQTPQEFTIYLKHGTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLAELEKQGYVLDENNTKLGFPSNAAYDDDDVKPQEFTIYLKHGMTHTDATDKNAEQKIVTETIHYVYENNQTAKTDYTSAVDFKRGYTTDNVTHKIISYDPWMVSSKKFGFVKSPAIEGYTPNHSQIDEITVTPDSKDVVKTVVYVGNAQEAQAIFYDETTGKEISGTREIATGKTDETISFTKDPNEVVKELEKQGYVFDKDNAKNNVFVAGTAYDKNSEVHQYFKYYLKHGHATVTPDQDPQKGQKTVTQTIKYEYADGTATGLADNVQTLTFKRTGDKDLVTHEVTWPDWSTVAGQQTSVVTSPALKGYTADTNEIPAITYHAGDSDVTYVVKYNADVQHAVINYIDGESDEILHTDKVNGHSDEKINYSTADMIKQLEAKGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTR
NGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKYEGDGYVLVSDGFKPGTKFGVGTPTYEVHFKHGMTHTDATDKNAEQKTVTETIHYVDENNQTVQPDSTTAVTFKRGYTTDNVTGKVVSYDPWTVDGNQADSKTFAAVPSPAVEGYTPNHQQINEFTVTPDSKDIVKTVVYVGDPQEAQAIFYDETTGKEISNTREIVNGKTDETIGFTKDPNEVVKELEKQGYVFDKDNANNNVFAAGTTYDKNSEVHQYFKYYFTHATTIVTPDNPKTPADVLPDNPGKNYPSGVAKDDLNKTVTRTINITTPDGKTQTITQKAEFTRSATVDEVTGEVTYGPWSKNVVLESVDVPNISGYVPSASVPEITVTPNDQDMTINITYKKLDSGKAADQGGNASNGGQATNGGSTTGQSAQNGQSGQTQNNAGAQQLPQTGNANNEKGALGLASAMFAAGLGLGFGSKKKCHED', 
'MSRKERNFKRFFGQEKARVKLYKSGKQWVKAGIREVQLLKVLGLPFLNKDVEQINNLDTNKDKNFKNQAMKATGLAGGAFTFAMLNDHHAYAASETPMTSEIASNSETVANQNSTTVTKSETSTTEYISSQTSTSQDATSSTNSTEKSTSSSTTDSQTSTDSTSDKSTSNSEKQDSSMSNSDTKASSSSTTDNSTSNNSTTSEKDTNSQANTTSTDSQKGSTSTNDNSITSTSTKDNQIRKNSTESNSITASNSTSDSNSGSTVSTNSTTSQLTSTSESQINTDLGSTLLVSDSTSTSTSTAPLKLRTFSRLATTTFAAAAATSTTNTYTGAGTDTNYNIPIYYKLTTVNNGTSMTFTYTVTYDNPATTTVERPTALSNSYAIYNTGTTNQTMFTLGSAYGTPSTATSYITDSTGAQVSNPRANTTNINKQGSGYTWANGYQMNGAQAKQGYGLTTTWTVPINSSGDTSFTFNPYSTSVTGGTNFFNGKKVTVTDPTSTANSQSASTSTANSQSASTSKSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSTSTSTSTSTANSQSTSTSTSTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSASTSTSESDSTSASTSLSESTSTSLSDSLSASTSLSDSASTSVSDSTSASTSLSGSESASLSDSASASTSLSESTSTSESTSTSESDSTSASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSDSASTSTSVSDSTSASTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSASTSTSVSDSTSASTSLSGSTSTSESDSTSMSTSLSGSESTSLSDSLSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSTSISTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISRSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSV
SDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTNTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSGSTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSSSESTSTSVSDSTSASVSTSISTSISMSESSSTSASTSDSTSTSASTSESRSASHSMSGTDSNNTSSSDSKSHSISNSDSNTTSDSASASTSISDSSSTSTSDSNASHSFSTSHSVSESNSMSTSHSQFDSISTSESMSGTDSTSLSTSLSHSASTSNSTSMTTSESQSNNDSQMHSNSLHHDAKDELPDTGDSDSNSTGLVSAVAAMLAGLGLFGKSRKNKKDKKNKGSEQ', 
'MPSRSPSSARSSRALYAPRLKPLAQAIALLLVAGGAQAAGQPFSAAWFAAKGAAQGGAAGAPRPGAQLPGAPPPLAQQQRVNQQLQRSMANLNNTVAAIAAQQAAQAAGRQAALNLPQDVPDGLGEGGLKVDASLPFEQAWQNAKGPVQTQAAGKTTVSIEQTADKAVLNWETFNVGRNTTVDFQQHADWALLNRVNDPSARPSQIQGQIKADGTVMLVNRNGVVFSGSSQVDVRNLTVAAANISDEQFRQRGLYYDNAGSRPTFTDAAGAVRVEQGAQLRTAAPSGSTRGGGYVLLLGSEVDNAGSIVTPKGQTVLSAGDSFVIRRGQGTDGNLTSTTRGNEVLPGFAADSSAGRVRNSGLVQAATGDISLSGREVEQAGVLLSSSSVDSRGTLHLKASERITLAEGATSAILVDSSGSAALDSQREALLKPLNGSSAAVSRGDDDRRDLSRVEIDSAGSVDFRDGSITLASGGQVAVNAGQRALLRDGAVIDVSGAQGVQVAMETNSIKVNVRGNEQRDASVNREGGGLNSQDVWVDVRDLVRVPAGTNGYASDRWYTAGGLLEVGGYLGTQGHSAGEWMAQGGIVSFTGNDVVTQAGSQINLSGGTLDVQGGYIRQTWLKGSDGRLYELGSAPGDLLYDGIYRGYEAHSERWDQTRYFYNPLIAPTQRYENGYSVGRDAGSLVIGSANARLDGQVVGDTYRGERQTEAPQAGLDGYNQSQNAVARGAQLVVGRYTPYYVKSSGLLEYALGADAGSLKQVVIGTGEVAAEEPTLDAPVAAERQGRLSLDSELLNGFQLGGLKVAAGESIRVDSALTLANGGEAILFANDVAIDADITAHGGSLQAGNVLAQISPNGTIDGFVDAGREAGILRVGDGVRLDASGLWSNLLLAPEDNDTLAYRDGGRISLRSGGDLSLGQGSLLDVSSGAALLADGKRLGGRGGDIALHASAGLAQASDGQLQLGGTLNGLGTSGAGTLSLQSGKVRIGGDDLGDGSLQLAEDFFQQGFASYRVVGRSGLTVAEDAQVRVARPVYRFASGAGEVAAGEAPREALEAWIPPLYLEDALAGRLVQREGADLYLQAGGDGNILGQLDPASQTLELGRGSLVEVDPGRAIVLRGPGQITLDGILNAWGGRIDVRQQQFGALDVTQDNQPKAQGQPHARSIWIGEQALLDVAGRAVTALDGRGRRYGEVQSGGSIVIGGEIDPGKAIATSADAFVIVRPGARLEASGSQAQLDVPGLGRVLLAGDGGRIALSSYNGLYLDGSLRAAAGGSGAAGGSLEIIADAPLYQGFTVVDDRVLAMRELILTAGHADSGLPTLLQPGMDDSALRYGQSRVGTQSLTGGGFDQLSLFSNGPLSFEGNIDLAMGRSLNLYAGTIAATGGGPSEVKLQAPYVRLSGIGMYGQQASGEFRPRLTYGPTATAEQVRLQVSAGRLLDIAGRLSFGSDGVINGVNAEAVRYQRPGFEKVTLRSEGDLRFAGDYPENGDPSGRLITHGDLQLTAAQLYPVTGASSTLYAGYGLDEGGQAVFDAERHLAIERSGESLPDTPLSVFGSLAFMASNIEQGGVVRAPLGLIQFGSNLDRAPGTVRLLPGSLTSVSGAELVMPYGGTTDGINYLVNQVPIQLTGAGGALAAGTLVAGVGLYASEVDVQQGARLDLSGGGELAGAGFISGRGGSTDARFHPLVQQDNDGFRLPELSSNPVYAIVPGHQAVSAPLGGEAGAIQPLVGQQVTIGDGVPGLAAGTYTLLPSTYALLPGAFRVEINGLAGQGAPMATQGLRNGSWATSGQLSIAGTSIRDSLSRQVILSSADTLRRYSQYNEMSYADFIRADAARKNIPRAMLPVDARSLYLGLRADEELRENALSFEGKVDFTPEESGYGGSLIVDAEAGIEILPEGGLPDSDFAGVSLVADDLNAIGASRIAIGTLPYVEYGEQGNFVQFGGSNRLFPVVLRKGAHLSAPEVIIGRDITLEGGSGISTLGKGKTAYDSSDGFIYQPG
GRNLLLLSNGWLNLLAPAADSSLPVRLGGCAEGAGCADTELYSEGTLGIATNGTVTFGDNVRYGTRNLSLALSTINIGSSQSLADAAARGVLPNGLALDQTVLQRLLRGERGAGIPALENLILSARDAVNIYGSVSLDTYDPATGKSSLANLVLGTPAIYGHGTGEDVASIRTASLVWSGSSQPAAAPVAGGAGSGSGTLRVDAERITLGYGANTQPAGETDEARLALGFAEVQLNASERISANHKGSLRVYQRLDGYVAGEGLRYSGGDLRLSTPLLTGEAGSLSRISSGGSLSLAAPAGAAAVTFDSGTAGLGAELSLSAREIRLDSAVSLPSGKLSLSAEDDLELGDGARIDLAGRKASFNDVDKYSWGGDLLLSSRAGDIRQAAGSLIDLSARNNRGGTLSAVALAEDAGVVDLQGRILGGASGDYDAGGTRVPFLGGELEIRAQRLGDGGSLSEQFTALNQRLNQGEVFGARRFQLKQGDLQIGDGLKAHRIEVSLDNGQLGVSGTVDASGAQVGEIRLAGGRGLSLGGNALLDAHGSLLRRDSYGQIIDSPNRAMVELSSGSGTLVLAGGARIDLRHGTAAPAEQVDGVARGTLELNAPRLGGVSAGDIAIDASGALDIRGAGSIALNAMQRYDDAPWGNDPAAGGRSYQVIDQAYLDARHAESSAFIAAALANRELLDGKLAGLTNATYADAFHLRPGVEIVSATADGDLVVQGDLDLSGYRYASLNPNTPLTEVYGSGEVGALVLRAGGDLNLYGSINDGFAPPPDSPDDKGWILTPGVQPFGGDLVVPGPGVVLGDGTAFLGGRTLNYDLPIKGTTLAAGTRLATEAVLEQPYTLAAGSVLVADIHDAAGTLLYAAGSLLRDGVTLEVGSRLGAGTLLAAPASVQAMTWPAGVPLPSILREGPSRPNVLLLNGELALARGSLIPSQTEVVLAGDAPFIELRPSDGVRQGRNWALAEMLPAGSQSWSMRLVAGADLAAADNRLVRPDSSASLNLADTHYQAKIEQSSGGLVFTDQATDWGITPGTPVDESNEWICGLGPYCAEPPRWTWAPGNYLGMPAGTAIGEGDLWWCSVDPSLCIENLGKTVVTPQNQLFSVLRTGTGDLDLASAGNLTQWSPYGVYTAGTQAADVATGFNQPRGLFNGSVLGAGGADYEVLSTSQYQAWYPEHGGNLDIAVGGDVVGDQWAEKLTSSDPIRPLPPSAAVGNWLWRQGSADREGVPTAWWVNFGSYVRGAEGDAPYLVGFTGFGTLGGGNLSMRTGGDAGNIAPRGDGSIPSSGNLNPRSQGLVLAVAGTGRLTSDGALQLGGGGDLNVRIGGEVNPSREARATQTYSSSGFDGLYSGGTIHDLQGALINLRGSASLYSGALGGIDPRYDTLLRDPAEVRSRDAFSPTLASSTGGLTLVAGDTGMRLETRGDLVLGGVTDPGRVGVPNTVGFTAPDGSVYQGGGIAWFSLWTAHTSIDLFAAGGNLTPSTQLVEATNAIPMAGRNLSPSDGRFIYPSIVRAAAPEGSIYLGPSSGDMGGVSLNVSTTPYSLLLAPSLNGELELLAGDSIYAGGYSVQRSGADPANLPSIWTPAFAGYSDAALLNPIAGNGSPDGNPAVIGGLPLFYFGPDSAASLARDLQPARFYALTGDIVGLNSGAQIRFGEQAGNRAGQTWYEGAGPVWMRAGRDIVASGTPLGQRISAPSQISTDASFTGNLFVHDDPNDLSLVQAGRDILYGNFNVAGPGTLEISAGRNILMEDRAAITSLGAVVPGDSRPGADIVLQAGAAGADYQAFLERYLDPANLAQAGTPLAEQPGKVVRTYESELAKWLNERFGFAGDAEQAQAFFAGLPAEQQRIFARQVYFAELRAGGREYNEVGGVRQGSYLRGRNAIAALFPERDPAGNPISYEGDIVMYGGAGVHTDFGGDIQLLSPGGRQVFGIEGEAPPSTAGIVTQGQGDIQAYSRDSILLGQSRIMTTFGGSILAWSAEGDINAGRGSQTTVVYTPPR
RIYDAWGNVSLSPQVPSTGAGIATLNPIPEVAPGDIDLIAPLGTIDAGEAGIRVSGNVNVAALQVVNAANIQTQGQSSGIPLVASVNTGALTSASAAASSATQAAEDVSRQQQAAARQRMPSVITVQVLGFGNERLEPSRDGASRSPGYNPDSAVQVLGAGALGEQARSQLTDEERGNLIL', 'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNDAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVRLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIVLEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDAGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVIGLNDLEVGAATLDNGQQGLLGSQQSTRVSAQALVNRGDGEVSGKRVEARVGSLDNRGGKLIGDDLLVVASGAIDNRLGLFSAANRLDLRARSLDNSGKGTLSSRGGLEVSLGGLLDNRDEGNLLSQGAQRVTVGQLDNRAGGLLSSRSELNVHGASLDNRGGVLVADAGLSATGGAFDNRDGGSASGKAGVRVEVASLRNDQGGKLLSDGRLDLAANAVGNAGGRIAAKGDLQATLGSLAQQGGELVSEKTLKVAADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAALDNQGGRLLSDAGVTLQGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLISETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGAANLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLVGERGVSLDLRGGDLDNQGGLISARGPLSIERLSVLDNRQGGEISSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNTGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQAGMSLRLGGGALRNQQDGLIFSQAGALDVQAGSLDNRQGTLQAQGDNRLRIGGALDNQGGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGGLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFT
GRGGDWSNDGLLASDGSFRLDLSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGNRAASLRNLSGVIESGKDFSLRASLIENRRAVLESKSGLYTAKMEQTACIEGVNAGDCSGKRNAIWTITQRDKTEVTASSAMGQLLAGGDFAIDGGTLNNLSSLIGSGGNLTANLEVLDNQGLETGELETIRVLRTARGGDIGGIDQKSRNFTNLYWYQSANFDPARAGEIPAALNAILSDWSFEYEFPSKGPTPISSGDQSYAAVIQAAGDVTVNASTRIDNGVTRPGYTFVGSGRQVGDSAVGGSGVSVVVPLTSQLPPDLARRQVNPVTLPGFSLPQGDNGLFRLSSRFAEDGNGSAALGAGADRTQGGSGVSVGQQGAGNAAGTWQGQGVRVDGLAGAANVQGQGGSTLGGSLPGVARVQGVPGNATPSASHKYLIETNPALTELKQFLNSDYLLSGLGMNPDDSKKRLGDGLYEQRLIRDAVVARTGQRYIDGLSSDEALFRYLMDNAIAYKDQLHLQLGVGLSAEQMAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLAPNGALIQGRDVKLVSGGDLHNVGTLRARNDLSATADNLDNSGLIEAGKRLDLLAGDSIRNRQGGVIAGRDVSLTALTGDVINERSVTRYDSALDGRTWERSFADSAARVEAANSLNVQAGRDIANLGGVLQSRGDLSLDAGRDVTVAAVEDRQGQTRWSTSRLQSVTQLGAEVSAGRDLNVSAGRDLTAVASTLEARRDIALSAGRDVTLAAAANEEHAYSKTRKVTYQEDKVAQQGTRVDAGGDLAINAGQDLRLIASQASAGDEAYLVAGDKLELLAANDSNYYLYDKKKKGDFGRKETRRDEVTDVKAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWNSAKGKGQTDETLRQTQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKEAEQRGDVDWRMVQEVHDSWKYSNSGMGPATQIAVAIAAAAIGGMAAAGALSGAGVGASSFAMGAGVGAAGSLSGTAAVSLINNKGDLGKVLKDSFSSDSLKQIAIASLTGGLTAEYFDGILQTKTDPLTGKVTVDLSSLSGVGRFAANQAMQNATSTVLSQALGQGGSLNEALKSALYNSFAAAGFNFVGDIGQEYSLKPGDPSMVTMHALMGGLAAQVSGGDFATGAAAAGANEALVAKLDQAFKSLSPENREAMVTMGSQLVGVLAAAVRDPDVTGKALESAAWVAKNSTQYNFLNHQDVADLDNALQKCKSQGNCRQVEEEFKARSDENRRRLNGCVAVGNCAEIRAEIDAGSTALNELVARQETANPGGSDSDIAYGFLMGRNVVDWTTAGQLHLEQTANLWWNGNPQWQKEVGAYLDQTGFNPFGIGVPAMGGAAGKVTAKALMNALKAGELPKGEVAPGKANLPTIGALADAEAGMPYTHPVKLAAKATGTAGKIKIEAGAIPDANEVRAGQGLSGLGYDVTHQTTASAKGIQGQRTADLHVDGLGSIDVYTPKNLDPTKIVRAIEKKSNQAGGVLVQADLPSTDMSSIAARMWGKTNAQSIKTIFFQKPDGSLVRFDRPAGGG', 
'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNGAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVKLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIALEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDGGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVVGLNELQVQAAALDNRSAGLLSSKGDMDIEFARLDNSAGGKLVSERRTLLKADRLDNRSGRIVAGQDLDLSSRLIDNRAGDISSTSRVVASAREQLDNRGGKIVGDSGLDITTPRMLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGVIDADLQGLDQHDRGNLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLVADEANSSKGRIAANGSLHAKVGTLSQKGGELTSQDSLTLDLGILNNNAGRIAGNQGVDITARQVDNSVGEIASQGVVALNLTEQLDNRGGKIVGDSGLGITAPHVLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGAIDAALQGLDQHDRGSLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLAADEANNSKGRIVAKDELRAKLGALVQNGGELTTQGALALDADKVDNGAGRIAGNRGVVIDARQVDNRAGEIASQGVATLNLTEQLDNRGGKVVADSGLGITAPRVLNQDKGVIASRDGLRLSGTELFNGNAGLLSSQRHIEVTLDGVLDNQGKGALLSDGTLTVSAGRIHNQDATLSSAGALRLSSQEAVDNRGGKLVTDSSLRLTSASLDNSRSGIISANAAAEIHTGVLNNSQKGNLGSNDGLGLIATEVDNSQEGRITAKGMIDANIKGLDQQGKGRLVSNAGIILDLNEGTLANGAQGLIATPGTLLLRQLGMVDNSGGEISSDRAFTLTTSALTNQGGRLRSGGVLTLRIAQALDNSLEGVLSGTGGLDIRALALDNRSGSIGSKGAVDIDVSRLENDDGDLLSEGRLKLTAERANSVRGRIAARGDLHASVTAFNQAGGELSSEGALMLEADSLDNRSGGLVSADGNLTVSARRIDNRAGEIASPGQVTLDVAEQLDNRGGKAIGDSGLRLAAPRVLNQDGGVLASRDGLRLNGAELFNGNGGLLSSQQSIDVILDGVLGNQAGSLSSQGRLSVKSGRLDNQGGAVSSAGTLSLSSQGALNNQGGRVVTDAGAVLRSASLDNSQGGIVSAKGAAEIRTGSLNNSQKGGIGSGAGLALVADLVDNSQNGRITAKGAIDANLKGLDQQGSG
RLVSDTAIALDLRGGELVNRAQGLIATPGALLLRQLGVVDNSGGGEISSDRSFTLAATALSNRGGRVISGDSLTLRIAQALDNSLQGVLSASGGLDVAALVFDNHSGIVASKGDTHIGVNRLENEAGRVVSEGALDLTAKQVSSAKGRIAAKGDLQVTVGTLEQQGGELASQGTLTLDADSLDNRNGGLVSADGGVTAEARQIDNRGGEISSVAKVALAVREQLDNRGGKVIGDSELSLTVQRLLNQAKGVLASRDGLHLDGAELLNGDGGLLSSQRLVDVTLSGALDNQGSGALVSEESLTVKADQVNNQAGTFSSAGSLLVTSRGELNNQGGRLVTDAGATLNSTGFDNSRAGLVSAKGAVAIRTGALNNSQKGSIGGNTGVTLVAGLVDNGREGRISTKGTLDANLKGLLQQGGGSLVGERGVTLDLNGGTLDNHDLGLVSTPGALLLRQLGMVDNSVGGEISSDRAFTLAANTLNNQGGRLISSEALTLRIAKTLDNSLKGQVLATDGLAIESQVLDNRAGTIGSKGDARISVTSLDNAEQGSLVSEGRLELVADQVSNGNQGRIAARGVLEAAVGTLLQQGGELVSQGSLDLRADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAVLDNQGGRLLSDAGVTLKGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLVSETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGADSLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLIGERGVSLDLRGGDLDNQGGLISARGPLSIERLNVLDNRQGGEIYSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNAGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQADMSLRLGGGALRNQQDGLIFSQAGALEVQAGSLDNRQGTLQAQGDNRLRIGGALDNQAGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGDLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFTGRGGDWSNDGLLASNGSLRLELSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGGWANRLENISGNLESTGDMRFSVSSLLNRRETLEIEGDLQNSAIGVRCTGCQLSERWGKTRSSSELVWIREYKSTLGDSSAAASITAGRDLLVVGASLQNIASNISAVRDATLSLSNFENKGYALGEYAVRGVYSPPSKFGEELLMRILAYNAVNDPSYG
EGYASTGGRLPNIHYFDKNFNEKVSPLEVIHGNGKNGGPGWHLYFGTLDVEYPDTDRWNKAIGRIPAPNYSSKKTDAIPDLLKGLAPLDELTINKGANSTVGAVVQAGGRVTVNAAESFNNSVLQGFQAVQETQLPHQDIAVSSTTSAVVTLKSQLPADLARQQINPLTLPGFSLPQGQNGLFRLASQGAQVNQASGALKSASDLTQSGHGVSVSAQTGSGASGWSTQARRVGDDRVTSLAGSAYQGRVAEAIDALRASAPISGDGGNTGRFQAGEHQATTGLGGLVEGNASGHSGNGVILADLRGGLPSFSSLPASDHVQGTVPGHDGNGTILANWQGAQATVQASPSTVRVEGVVSSPGGNGSILADLPAEQSSVQALPSAVRAQGSLPRLEERSALLAEPPVGQPALQTLPSVARVEGVPSNATPSNSHKYLIETNPALTELKQFLNSDYLLGGLGINPDDSKKRLGDGLYEQRLVREAIVQRTGQRFIAGLNSDEAMFRYLMDNAIASKDVLGLTPGVTLSAAQVAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLGPNGALIQGRDVNLITGGDLRNAGTLRAQNDLSATAGNIDNSGLIEAGNRLDLLASGSIRNDQGGIIAGREVSLSALTGDVINERTVTQHQSSYRGTGTTEAFADSAARIEAAQKLTVSAGRDVANIGGVIDSKGDLALQGGRDVLVSAAVAERGWTAGSQAYQTQTTQMGAEVVAGRDISVSAGRDISVVGSRIDARRDVTFEAGRDVGLVAAANEEHAYGKTKKVTFQDDKITQQATRVDAGGDLAINAGQDLRLVASQASAGDEAYLVAGDKLELLAANDSSYYLYDKKSKGSFGSKKTRRDEITDVTAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWQSSKGKGQTDETVRQSQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKQMEQRGDVDWRRVQELHDSWKYSNSGLGVGAQLAIAIVVAYFTAGAASAALGSMAGVGAGSGSMMAAAGSTAMVQAGTAVGTAAAGWANAAGTAVAMGMASNGAISTINNRGNLGDVVKDVTSSDALRGYVVAGTTAGLTAGVYDKWTSTQTGTSTALPNTGAVAPAAGLGTWQGVGQFTSNQLLQNGTSVLLDRALGGKGSLGDALQNSLANAFAAYGFKLIGDTTHGVLDDGSLGKIGLHALMGGLAAEAVGGDFRTGALAAGVNEALVDSLAKQYASLPIDDKKGLLIMSSQLIGVLAASTQGDADAKSLQTGAWVAGNATQHNYLSHWQEEKKRQEVDGCKDKQLCKTGIEAKWAIISAQQDVGIVVGVGGGIGLSTAETAVGVYELVKNWRETYAALEQLATSPEFRQQFGDNYLKGLEERAAFLTQAYEDAGWQGSVTAGVEGGRFAAELVGVLTAVKGGAQITAKLPTAAKNLVNAIAESPVSGSMSSQLGAVGDLGRLGGGGKGYVDILSHEAKQHILYGDKPGSGGHLWPGQAGKTVFPQNWSADKIVHEVGDIATSPSTKWYAQTGTGGVYTSKGDPAKWVAYEVRDGVRMRVVYQPATGKVITAFPDNAPIPPYKPIK']\n",
421
+ "['CAC14227', 'P12255', 'P20471', 'A64556', 'AAF25576', 'Q4L9P0', 'Q9I5N6', 'Q9I791', 'Q9I120']\n"
422
+ ]
423
+ }
424
+ ],
425
+ "source": [
426
+ "sequences = list(cpu_sequences['sequence'])\n",
427
+ "print(sequences)\n",
428
+ "accession = list(cpu_sequences['id'])\n",
429
+ "print(accession)"
430
+ ]
431
+ },
432
+ {
433
+ "cell_type": "code",
434
+ "execution_count": 8,
435
+ "id": "2a1832cb",
436
+ "metadata": {},
437
+ "outputs": [
438
+ {
439
+ "data": {
440
+ "application/vnd.jupyter.widget-view+json": {
441
+ "model_id": "5df74f5eb4e24f72b645d0bbc1dc5c36",
442
+ "version_major": 2,
443
+ "version_minor": 0
444
+ },
445
+ "text/plain": [
446
+ "Processing Sequences: 0%| | 0/9 [00:00<?, ?it/s]"
447
+ ]
448
+ },
449
+ "metadata": {},
450
+ "output_type": "display_data"
451
+ }
452
+ ],
453
+ "source": [
454
+ "# Setup device\n",
455
+ "device = torch.device('cpu')\n",
456
+ "\n",
457
+ "# Load tokenizer and model; float32 on CPU, float16 otherwise (compare device.type: a torch.device never equals a str)\n",
458
+ "tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
459
+ "model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
460
+ "model.float() if device.type == 'cpu' else model.half()\n",
461
+ "\n",
462
+ "# Clean sequences\n",
463
+ "sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
464
+ "sequences = [ \"<AA2fold> \" + s for s in sequences]\n",
465
+ "\n",
466
+ "# Process each sequence individually\n",
467
+ "for i, (seq, acc_id) in enumerate(tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\")):\n",
468
+ " try:\n",
469
+ " # Tokenize\n",
470
+ " ids = tokenizer(\n",
471
+ " seq,\n",
472
+ " add_special_tokens=True,\n",
473
+ " return_tensors='pt'\n",
474
+ " ).to(device)\n",
475
+ "\n",
476
+ " # Forward pass\n",
477
+ " with torch.no_grad():\n",
478
+ " embedding_repr = model(\n",
479
+ " ids.input_ids,\n",
480
+ " attention_mask=ids.attention_mask\n",
481
+ " )\n",
482
+ "\n",
483
+ " # Index of the last attended token (attention-mask total minus one); used as the exclusive slice end\n",
484
+ " real_len = ids.attention_mask[0].sum().item() - 1\n",
485
+ "\n",
486
+ " # Skip position 0 (presumably the <AA2fold> prefix token) and the last token, then mean-pool\n",
487
+ " emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
488
+ " emb_avg = emb.mean(dim=0).cpu().numpy()\n",
489
+ "\n",
490
+ " # Save embedding using accession ID\n",
491
+ " np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
492
+ "\n",
493
+ "\n",
494
+ " # Cleanup\n",
495
+ " del ids, embedding_repr, emb, emb_avg\n",
496
+ " torch.cuda.empty_cache()\n",
497
+ " gc.collect()\n",
498
+ "\n",
499
+ " except RuntimeError as e:\n",
500
+ " print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
501
+ "\n"
502
+ ]
503
+ }
504
+ ],
505
+ "metadata": {
506
+ "kernelspec": {
507
+ "display_name": "tesisEnv",
508
+ "language": "python",
509
+ "name": "python3"
510
+ },
511
+ "language_info": {
512
+ "codemirror_mode": {
513
+ "name": "ipython",
514
+ "version": 3
515
+ },
516
+ "file_extension": ".py",
517
+ "mimetype": "text/x-python",
518
+ "name": "python",
519
+ "nbconvert_exporter": "python",
520
+ "pygments_lexer": "ipython3",
521
+ "version": "3.10.16"
522
+ }
523
+ },
524
+ "nbformat": 4,
525
+ "nbformat_minor": 5
526
+ }
notebooks/__pycache__/my_utils.cpython-310.pyc ADDED
Binary file (14.2 kB). View file
 
notebooks/hyperparamsRF.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/my_utils.py ADDED
@@ -0,0 +1,607 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Libraries
2
+ import os
3
+ import re
4
+ from pprint import pprint
5
+ from io import StringIO
6
+ from concurrent.futures import ThreadPoolExecutor, as_completed
7
+ from urllib.error import HTTPError
8
+ from typing import Literal
9
+
10
+
11
+ import pandas as pd
12
+ import numpy as np
13
+
14
+ from sklearn.ensemble import RandomForestClassifier
15
+ from sklearn import svm
16
+ from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
17
+ from sklearn.metrics import classification_report, accuracy_score, f1_score, recall_score, precision_score, confusion_matrix
18
+ from sklearn.decomposition import PCA
19
+ from sklearn.preprocessing import StandardScaler
20
+ from sklearn.pipeline import Pipeline
21
+ from sklearn.manifold import TSNE
22
+ from sklearn.model_selection import train_test_split
23
+ from sklearn.utils import resample
24
+
25
+ import umap
26
+
27
+ import requests
28
+ from Bio import Entrez
29
+ from Bio import SeqIO
30
+ from tqdm.notebook import tqdm
31
+
32
+ # Visualization libraries
33
+ import seaborn as sns
34
+ import matplotlib.pyplot as plt
35
+ import plotly.express as px
36
+
37
+ from esm.models.esmc import ESMC
38
+ from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
39
+ from transformers import T5Tokenizer, T5EncoderModel
40
+
41
+ import torch
42
+ import gc
43
+
44
+
45
+
46
+ # Load one chunk of embeddings
47
def load_emb(path: str, acc: list[str])->list[np.array]:
    """Load one saved embedding per accession and reduce it to a single array.

    For every identifier in ``acc`` the file ``<path>/<identifier>.npy`` is
    read with ``np.load``.  Arrays are then reduced according to their rank:

    * rank 3: the leading axis is squeezed away (it must have length 1) and
      the result is averaged over its first axis;
    * rank 2: averaged over its first axis;
    * any other rank: kept unchanged.

    Args:
        path: Directory containing the ``.npy`` files.
        acc: Identifiers used as file stems.

    Returns:
        A list with one array per identifier, in the order of ``acc``.
    """
    vectors = []
    for accession in tqdm(acc, desc = 'Cargando embeddings'):
        arr: np.ndarray = np.load(os.path.join(path, f"{accession}.npy"))
        rank = arr.ndim
        if rank == 3:
            # Drop the leading singleton axis before pooling.
            arr = arr.squeeze(axis = 0)
        if rank in (2, 3):
            # Mean-pool over the first axis (presumably the residue axis).
            arr = arr.mean(axis = 0)
        vectors.append(arr)
    return vectors
61
+
62
def confusion(title : str, y_true: np.array, y_pred: np.array) -> None:
    """Plot a confusion matrix, normalised per predicted class, as a heatmap.

    Args:
        title: Text appended to the figure title.
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.

    The matrix is computed with ``normalize='pred'``, i.e. every column
    (predicted label) is normalised independently.
    """
    # Build the tick labels from the union of true and predicted labels and
    # pass the same list to confusion_matrix, so that rows/columns and tick
    # labels always line up, even when a class is predicted that never occurs
    # in y_true.
    class_names = np.unique(
        np.concatenate([np.asarray(y_true).ravel(), np.asarray(y_pred).ravel()])
    )

    cm = confusion_matrix(y_true=y_true,
                          y_pred=y_pred,
                          labels=class_names,
                          normalize='pred')

    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=True, fmt='.2f', cmap='Blues',
                xticklabels=class_names, yticklabels=class_names)

    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.title(f'Confusion Matrix - {title}')
    plt.tight_layout()
    plt.show()
78
+
79
def perplexity(X):
    """Plot the t-SNE KL divergence obtained for a sweep of perplexity values.

    t-SNE is fitted once for each perplexity in 5, 10, ..., 50 and the
    resulting ``kl_divergence_`` is drawn as a line plot, which helps when
    choosing a perplexity for ``tsne_plot``.

    Args:
        X: Sequence of embeddings; it is stacked row-wise with ``np.vstack``.
    """
    X_array = np.vstack(X)
    n_samples = X_array.shape[0]

    # scikit-learn requires perplexity < n_samples, so skip candidates that
    # are too large for small inputs instead of failing mid-sweep.
    candidates = [int(p) for p in np.arange(5, 55, 5) if p < n_samples]
    divergence = []

    for value in candidates:
        # A fixed seed keeps the sweep reproducible, like the other plots here.
        model = TSNE(n_components=2, init="pca", perplexity=value, random_state=42)
        model.fit_transform(X_array)  # only the fitted KL divergence is needed
        divergence.append(model.kl_divergence_)

    fig = px.line(x=candidates, y=divergence, markers=True)
    fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
    fig.update_traces(line_color="red", line_width=1)
    fig.show()
92
+
93
def plot_umap(X: list[np.array], y: list[str], title: str, org : list[str]) -> None:
    """Show an interactive 2-D UMAP scatter plot of standardised embeddings.

    Args:
        X: Embeddings, stacked row-wise before scaling.
        y: Values used to colour the points (also shown on hover).
        title: Figure title.
        org: Extra per-point values shown on hover.
    """
    features = StandardScaler().fit_transform(np.vstack(X))
    projection = umap.UMAP(n_neighbors=30, random_state=42).fit_transform(features)

    first_axis = projection[:, 0]
    second_axis = projection[:, 1]

    fig = px.scatter(x=first_axis, y=second_axis, color=y, hover_data= [org, y])
    fig.update_layout(title=title, xaxis_title="First UMAP", yaxis_title="Second UMAP")
    fig.show()
107
+
108
+
109
def plot_PCA(X: np.array, labels: list[str], title: str, org : list[str], scale: bool) -> None:
    """Show an interactive scatter plot of the first two principal components.

    Args:
        X: Embeddings, stacked row-wise with ``np.vstack``.
        labels: Values used to colour the points (also shown on hover).
        title: Figure title.
        org: Extra per-point values shown on hover.
        scale: When true, the data is standardised before the PCA.

    The axis titles include the explained-variance ratio of each component.
    """
    data = np.vstack(X)
    reducer = PCA(n_components=2, random_state=42)

    if not scale:
        projected = reducer.fit_transform(data)
        ratios = reducer.explained_variance_ratio_
    else:
        scaled_pca = Pipeline([('scaler', StandardScaler()), ('pca', reducer)])
        projected = scaled_pca.fit_transform(data)
        ratios = scaled_pca.named_steps['pca'].explained_variance_ratio_

    df_plot = pd.DataFrame({'PC1': projected[:, 0], 'PC2': projected[:, 1], 'Label': labels})

    x_title = f'PC1 ({ratios[0]*100:.1f}%)'
    y_title = f'PC2 ({ratios[1]*100:.1f}%)'

    fig = px.scatter(df_plot, x='PC1', y='PC2', color='Label', hover_data= [org, labels])
    fig.update_layout(title=title, xaxis_title=x_title, yaxis_title=y_title)
    fig.show()
134
+
135
+
136
def tsne_plot(X, y, org : list[str]) -> None:
    """Show an interactive 2-D t-SNE scatter plot of standardised embeddings.

    Args:
        X: Embeddings; standardised first, then stacked row-wise.
        y: Values used to colour the points (also shown on hover).
        org: Extra per-point values shown on hover.

    The perplexity is fixed at 60 and the seed at 42.
    """
    standardised = StandardScaler().fit_transform(X)
    stacked = np.vstack(standardised)

    projection = TSNE(n_components=2, perplexity=60, random_state=42).fit_transform(stacked)

    fig = px.scatter(x=projection[:, 0], y=projection[:, 1], color=y, hover_data= [org, y])
    fig.update_layout(title="t-SNE", xaxis_title="First t-SNE", yaxis_title="Second t-SNE")
    fig.show()
149
+
150
+
151
def plot_emb(X, y, model_name, org : list[str]):
    """Draw the PCA, t-SNE and UMAP projections of one embedding set, in that order.

    Args:
        X: Embeddings forwarded to each plotting function.
        y: Values used to colour the points.
        model_name: Only used in the progress message printed first.
        org: Extra per-point values shown on hover.
    """
    print(f"Plotting embeddings for: {model_name}")
    plot_PCA(X=X, labels=y, title="PCA", org=org, scale=True)
    tsne_plot(X=X, y=y, org=org)
    plot_umap(X=X, y=y, title="UMAP", org=org)
156
+
157
+
158
def evaluate(model, X_test, y_test):
    """Score a fitted model on held-out data, print the scores and return them.

    Args:
        model: Object exposing ``predict``.
        X_test: Test features.
        y_test: True labels for ``X_test``.

    Returns:
        Dict with the keys ``'Accuracy'``, ``'Recall'``, ``'Precision'`` and
        ``'F1'``; the last three use weighted averaging.
    """
    predictions = model.predict(X_test)

    result = {
        'Accuracy': accuracy_score(y_test, predictions),
        'Recall': recall_score(y_test, predictions, average = 'weighted'),
        'Precision': precision_score(y_test, predictions, average='weighted'),
        'F1': f1_score(y_test, predictions, average='weighted'),
    }

    pprint(result)
    return result
172
+
173
+
174
+
175
def train_rf(title : str, X : np.ndarray, y : np.ndarray, params: dict) -> tuple[RandomForestClassifier, dict]:
    """Fit a random forest on a stratified 67/33 split and report test results.

    Args:
        title: Text used in the confusion-matrix figure title.
        X: Feature matrix.
        y: Labels; also used to stratify the split.
        params: Keyword arguments forwarded to ``RandomForestClassifier``.

    Returns:
        The fitted classifier and the metrics dict produced by ``evaluate``.
    """
    split = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
    X_train, X_test, y_train, y_test = split

    forest: RandomForestClassifier = RandomForestClassifier(**params)
    forest.fit(X_train, y_train)

    # Predictions are reused for the text report and the confusion matrix.
    test_predictions = forest.predict(X_test)

    metrics = evaluate(forest, X_test, y_test)
    print(classification_report(y_test, test_predictions, zero_division=0))
    confusion(title = title, y_true = y_test, y_pred = test_predictions)

    return forest, metrics
199
+
200
def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[Pipeline, dict]:
    """Fit a StandardScaler + SVC pipeline on a stratified 67/33 split and report results.

    Args:
        title: Text used in the confusion-matrix figure title.
        X: Feature matrix.
        y: Labels; also used to stratify the split.
        params: Hyper-parameters keyed with the ``svm__`` prefix.  Keys
            without that prefix are ignored.

    Returns:
        The fitted pipeline and the metrics dict produced by ``evaluate``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, stratify=y, random_state=42
    )

    # Strip the pipeline step prefix so the values can be given to SVC directly.
    svc_params = {}
    for key, value in params.items():
        if key.startswith('svm__'):
            svc_params[key.replace('svm__', '')] = value

    scaler_step = ('scaler', StandardScaler())
    svm_step = ('svm', svm.SVC(**svc_params))
    pipeline = Pipeline([scaler_step, svm_step])
    pipeline.fit(X_train, y_train)

    test_predictions = pipeline.predict(X_test)

    metrics = evaluate(model=pipeline, X_test=X_test, y_test=y_test)
    confusion(title = title, y_true = y_test, y_pred = test_predictions)
    print(classification_report(y_test, test_predictions, zero_division=0))

    return pipeline, metrics
226
+
227
+
228
def randomSVM(X: np.array, y: np.array) -> dict:
    """Run a randomised hyper-parameter search for a StandardScaler + SVC pipeline.

    The data is split 67/33 (stratified, seed 42); only the training part is
    used.  A stratified sample of 3500 rows is drawn from it and a
    ``RandomizedSearchCV`` (50 candidates, 3-fold CV, weighted F1) is fitted
    on that sample.

    Args:
        X: Feature matrix.
        y: Labels.  (The original signature read ``y = np.array``, which made
            the ``np.array`` function the default value instead of annotating
            the parameter; ``y`` is required.)

    Returns:
        The best parameter combination, keyed with the ``svm__`` prefix.
    """
    X_train, _, y_train, _ = train_test_split(X,
                                              y,
                                              test_size=0.33,
                                              stratify=y,
                                              random_state=42)

    # NOTE(review): resample() draws WITH replacement by default, so the
    # sample may contain duplicated rows that end up in different CV folds.
    # Confirm whether replace=False was intended.
    X_sample, y_sample = resample(X_train,
                                  y_train,
                                  n_samples = 3500,
                                  stratify = y_train,
                                  random_state = 42)

    pipeline = Pipeline([('scaler', StandardScaler()),
                         ('svm', svm.SVC())])

    param_distributions = {
        'svm__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
        'svm__kernel': ['rbf'],
        'svm__gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
        'svm__shrinking': [True, False],
        'svm__class_weight': ['balanced'],
        'svm__tol': [1e-5, 1e-4, 1e-3, 1e-2],
        'svm__max_iter': [-1, 1000, 5000, 10000],
        'svm__probability': [False, True],
        'svm__decision_function_shape': ['ovr', 'ovo'],
        'svm__cache_size': [200, 400, 600]
    }

    random_search = RandomizedSearchCV(
        estimator=pipeline,
        param_distributions=param_distributions,
        n_iter=50,
        scoring='f1_weighted',
        cv=3,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    random_search.fit(X_sample, y_sample)

    pprint(random_search.best_params_)

    return random_search.best_params_
274
+
275
def randomSearch(X: np.ndarray, y: np.ndarray) -> dict:
    """Run a randomised hyper-parameter search for a random forest.

    The data is split 67/33 (stratified, seed 42); only the training part is
    used.  A stratified sample of 3500 rows is drawn from it and a
    ``RandomizedSearchCV`` (50 candidates, 3-fold CV, weighted F1) is fitted
    on that sample.

    Args:
        X: Feature matrix.
        y: Labels.

    Returns:
        The best parameter combination found.
    """
    X_train, _, y_train, _ = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)
    classifier : RandomForestClassifier = RandomForestClassifier(random_state=42)

    # NOTE(review): resample() draws WITH replacement by default, so the
    # sample may contain duplicated rows that end up in different CV folds.
    # Confirm whether replace=False was intended.
    X_sample, y_sample = resample(X_train,
                                  y_train,
                                  n_samples = 3500,
                                  stratify = y_train,
                                  random_state = 42)

    param_grid = {
        # Plain Python ints (same values as np.arange(500, 4000, 400)) so that
        # best_params_ holds built-in types that print/serialise cleanly.
        'n_estimators': list(range(500, 4000, 400)),
        'max_depth': [None, 10, 20, 30, 40, 50],
        'min_samples_split': [2, 5, 10, 15, 20],
        'min_samples_leaf': [1, 2, 4, 8, 10],
        'max_features': ['sqrt', 'log2', None, 0.3, 0.5, 0.7],
        'bootstrap': [True, False],
        'criterion': ['gini', 'entropy'],
        'max_leaf_nodes': [None, 10, 50, 100, 200],
        'class_weight' : ['balanced']
    }

    rf_random = RandomizedSearchCV(estimator = classifier,
                                   param_distributions = param_grid,
                                   n_iter= 50,
                                   scoring = 'f1_weighted',
                                   cv = 3,
                                   verbose = 2,
                                   # Seed the candidate sampling so repeated
                                   # runs explore the same 50 combinations,
                                   # matching randomSVM.
                                   random_state = 42,
                                   n_jobs = -1)

    rf_random.fit(X = X_sample, y = y_sample)

    print('Best Params')
    pprint(rf_random.best_params_)

    return rf_random.best_params_
312
+
313
def gridSearch(X: np.ndarray, y: np.ndarray, grid: dict, cv: int = 3):
    """Exhaustively search a random-forest parameter grid and score the winner.

    The data is split 67/33 (stratified, seed 42).  The grid search is fitted
    on the training part with weighted F1 as the selection metric, and the
    refitted best model is then scored on the test part with ``evaluate``.

    Args:
        X: Feature matrix.
        y: Labels.
        grid: Parameter grid forwarded to ``GridSearchCV``.
        cv: Number of cross-validation folds (at least 2).  The previous
            hard-coded value of 1 is rejected by scikit-learn, because k-fold
            splitting needs two or more folds, so the search could never run.
            The default matches the 3 folds used by the randomised searches in
            this module.

    Returns:
        The fitted ``GridSearchCV`` object and the metrics dict from ``evaluate``.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, stratify=y, random_state=42)

    # Initialize GridSearchCV with the base model and hyperparameters
    grid_search: GridSearchCV = GridSearchCV(
        estimator=RandomForestClassifier(random_state=42),
        param_grid=grid,
        cv=cv,
        scoring = 'f1_weighted',
        verbose = 1,
        pre_dispatch = 5,
        n_jobs=-1
    )

    grid_search.fit(X = X_train, y = y_train)

    print('Best Estimator')
    pprint(grid_search.best_estimator_)

    evaluation = evaluate(grid_search, X_test, y_test)

    return grid_search, evaluation
336
+
337
+
338
def fetch_uniprot_sequence(uniprot_id: str):
    """Download the sequence of a UniProt entry and return it as a string.

    The UniProtKB FASTA endpoint is tried first.  If it answers with status
    200 but the body is not exactly one FASTA record, the UniSave endpoint is
    queried and the first record it returns is used.

    Args:
        uniprot_id: UniProt accession.

    Returns:
        The sequence, or ``None`` when it could not be retrieved.  A message
        is printed for every failure path.
    """
    # Without a timeout a stalled connection would block its worker forever.
    timeout = 30

    url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f'URL inválido o no accesible: {url} ({exc})')
        return None

    if response.status_code != 200:
        print(f'URL inválido o no accesible: {url}')
        return None

    try:
        # SeqIO.read insists on exactly one record.
        return str(SeqIO.read(StringIO(response.text), "fasta").seq)
    except ValueError:
        # Empty or unexpected body: fall through to UniSave.
        pass

    url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        print(f'UniSave URL inválido: {url} ({exc})')
        return None

    if response.status_code != 200:
        print(f'UniSave URL inválido: {url}')
        return None

    try:
        # UniSave may return several records; take the first one.
        record = next(SeqIO.parse(StringIO(response.text), "fasta"), None)
    except ValueError:
        record = None

    if record is None:
        print(f'No se pudo obtener la entrada FASTA para {uniprot_id} desde UniSave')
        return None

    return str(record.seq)
367
+
368
def fetch_refseq_sequence(refseq_id : str):
    """Fetch the protein sequence for an accession, trying NCBI first and RCSB second.

    The accession is first requested from the NCBI ``protein`` database via
    Entrez.  If that fails for any reason, the RCSB FASTA endpoint is queried
    with the same identifier.

    Args:
        refseq_id: Accession to look up.  ``None`` and NaN are accepted.

    Returns:
        The raw amino-acid sequence as a string, or ``None`` when the
        identifier is missing or no source returned a parsable record.
    """
    # Missing identifiers are answered before any global Entrez state is touched.
    if refseq_id is None or pd.isna(refseq_id):
        return None

    # NOTE(security): the fallback e-mail and API key below are hard-coded
    # credentials kept only for backward compatibility.  Prefer the
    # NCBI_EMAIL / NCBI_API_KEY environment variables; the embedded key
    # should be rotated and removed from the source.
    Entrez.email = os.environ.get("NCBI_EMAIL", "puglia.jd@gmail.com")  # REQUIRED
    Entrez.api_key = os.environ.get("NCBI_API_KEY", "d768134734612d58be85117e1ff22e243807")

    try:
        handle = Entrez.efetch(
            db="protein",
            id=refseq_id,
            rettype="fasta",
            retmode="text"
        )
        try:
            record = SeqIO.read(handle, "fasta")
        finally:
            # Close the handle even when parsing fails.
            handle.close()
        return str(record.seq)
    except Exception:
        # Deliberate best-effort: any Entrez failure falls through to RCSB.
        pass

    url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
    try:
        response = requests.get(url, timeout=30)
    except requests.RequestException as exc:
        print(f"No se pudo acceder a {url}: {exc}")
        return None

    if response.status_code != 200:
        return None

    fasta_data = response.text
    try:
        record = SeqIO.read(StringIO(fasta_data), "fasta")
    except ValueError:
        print(f"No se pudo convertir {fasta_data}, id: {refseq_id}")
        return None

    return str(record.seq)
402
+
403
+ # Per-row helper used by fetch_sequences_for_dataframe (defined further below)
404
+ def _fetch_sequence_for_row(idx, row):
405
+ """
406
+ Helper to fetch sequence for a single row. Returns (idx, sequence).
407
+ """
408
+ sequence = None
409
+ # Try SwissProt ID
410
+ swiss_id = row.get('SwissProt_ID')
411
+ if swiss_id and not pd.isna(swiss_id):
412
+ try:
413
+ sequence = fetch_uniprot_sequence(swiss_id)
414
+ except HTTPError as e:
415
+ print(f"Warning: SwissProt fetch failed for {swiss_id} with HTTP {e.code}")
416
+ sequence = None
417
+
418
+ # Try RefSeq if no SwissProt
419
+ if not sequence and row.get('Refseq_Accession') and not pd.isna(row['Refseq_Accession']):
420
+ try:
421
+ sequence = fetch_refseq_sequence(row['Refseq_Accession'])
422
+ except HTTPError as e:
423
+ print(f"Warning: RefSeq fetch failed for {row['Refseq_Accession']} with HTTP {e.code}")
424
+ sequence = None
425
+
426
+ # Try Other_Accession if still no sequence
427
+ if not sequence and row.get('Other_Accession') and not pd.isna(row['Other_Accession']):
428
+ try:
429
+ sequence = fetch_refseq_sequence(row['Other_Accession'])
430
+ except HTTPError as e:
431
+ print(f"Warning: RefSeq fetch failed for {row['Other_Accession']} with HTTP {e.code}")
432
+ sequence = None
433
+
434
+ return idx, sequence
435
+
436
+
437
def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: int = None, max_workers: int = 5) -> pd.DataFrame:
    """
    Add a 'sequence' column to the dataframe by fetching sequences from
    SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.

    Args:
        df: Input DataFrame with ID columns. It is not modified.
        batch_size: Optional size of row-chunks to process sequentially.
            ``None`` or a non-positive value processes all rows as one chunk.
        max_workers: Number of threads for parallel fetching.

    Returns:
        A copy of ``df`` with the 'sequence' column filled in (``None`` where
        no sequence could be retrieved).
    """
    result_df = df.copy()
    if 'sequence' not in result_df.columns:
        result_df['sequence'] = None

    total_rows = len(result_df)
    if total_rows == 0:
        # Nothing to fetch; returning early also avoids dividing by zero in
        # the success-rate summary below.
        print("Sequence retrieval complete")
        print("Successfully retrieved 0 out of 0 sequences")
        return result_df

    if not batch_size or batch_size <= 0:
        batch_size = total_rows

    # Overall progress bar
    with tqdm(total=total_rows, desc="Retrieving sequences", unit="row") as pbar:
        for start in range(0, total_rows, batch_size):
            sub_df = result_df.iloc[start:start + batch_size]
            # Launch parallel tasks for this chunk
            with ThreadPoolExecutor(max_workers=max_workers) as executor:
                futures = {
                    executor.submit(_fetch_sequence_for_row, idx, row): idx
                    for idx, row in sub_df.iterrows()
                }
                # Collect results as they finish
                for future in as_completed(futures):
                    idx = futures[future]
                    try:
                        _, seq = future.result()
                    except Exception as exc:
                        # One failing row must not discard the sequences that
                        # were already fetched for all the others.
                        print(f"Warning: sequence fetch failed for row {idx}: {exc}")
                        seq = None
                    result_df.at[idx, 'sequence'] = seq
                    pbar.update(1)

    print("Sequence retrieval complete")
    success_count = result_df['sequence'].notna().sum()
    print(f"Successfully retrieved {success_count} out of {total_rows} sequences "
          f"({round(success_count/total_rows*100, 2)}%)")
    return result_df
483
+
484
# Holds the most recently loaded ESM-C client, keyed by (model name, device),
# so that embedding many sequences does not reload the weights for each one.
_ESMC_CLIENT_CACHE: dict = {}


def _get_esmc_client(model, device):
    """Return the ESM-C client for ``(model, device)``, loading it only when it changes."""
    key = (model, str(device))
    client = _ESMC_CLIENT_CACHE.get(key)
    if client is None:
        # Keep a single model resident: drop the previous one before loading.
        _ESMC_CLIENT_CACHE.clear()
        client = ESMC.from_pretrained(model).to(device)
        _ESMC_CLIENT_CACHE[key] = client
    return client


def esm_embed_sequence(model : Literal["esmc_300m", "esmc_600m"], sequence : str, device : str) -> "LogitsOutput":

    """
    Embed a protein sequence using the specified ESM model.

    Args:
        model: Name of the ESM model to use.
        sequence: Protein sequence to embed.
        device: Device the model is moved to (e.g. ``'cuda'`` or ``'cpu'``).

    Returns:
        LogitsOutput: Contains the embeddings and logits for the sequence.

    Raises:
        ESMProteinError: If encoding the protein returns an error object.
    """

    # Loading the pretrained weights is by far the most expensive step, so
    # the client is reused across calls with the same model and device.
    client = _get_esmc_client(model, device)

    protein = ESMProtein(sequence=sequence)
    protein_tensor = client.encode(protein)

    if isinstance(protein_tensor, ESMProteinError):
        raise protein_tensor

    output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))

    return output
507
+
508
def esm_save_emb(model: Literal["esmc_300m", "esmc_600m"],
                 seq_list: list[str],
                 id_list: list[str],
                 path: str,
                 device : Literal['cuda', 'cpu'] = 'cuda') -> None:

    """
    Embed every sequence with an ESM model and write one ``.npy`` file per ID.

    Embeddings of rank 3 have their leading axis squeezed away and are then
    averaged over the first axis; rank-2 embeddings are averaged over the
    first axis; other ranks are stored unchanged.

    Args:
        model: ESM model name. Options are "esmc_300m" or "esmc_600m".
        seq_list: List of protein sequences.
        id_list: List of identifiers corresponding to the sequences.
        path: Directory to save the embeddings (created if missing).
        device: Device passed on to ``esm_embed_sequence``.
    """

    assert len(seq_list) == len(id_list), "Sequence and ID lists must be the same length."
    os.makedirs(path, exist_ok=True)

    progress = tqdm(zip(seq_list, id_list), total=len(seq_list), desc="Saving embeddings")
    for step, (sequence, accession) in enumerate(progress):
        try:
            result: LogitsOutput = esm_embed_sequence(model=model, sequence=sequence, device = device)
            pooled = result.embeddings.cpu().numpy()

            if pooled.ndim == 3:
                # Drop the leading singleton axis first.
                pooled = pooled.squeeze(axis=0)
            if pooled.ndim == 2:
                pooled = pooled.mean(axis=0)

            target = os.path.join(path, f"{accession}.npy")
            np.save(target, pooled)

        except ESMProteinError as e:
            print(f"Error processing {accession}: {e}")

        # Periodic housekeeping, once every 100 sequences.
        if step % 100 == 0:
            gc.collect()
            torch.cuda.empty_cache()
545
+
546
+
547
def prost_embed_sequence(seq_list: list[str],
                         acc_list: list[str],
                         path: str,
                         device : Literal["cuda:0", "cpu"] = "cuda:0") -> None:

    """
    Embed protein sequences using ProstT5 and save one mean-pooled vector per sequence.

    Each sequence is tokenised and passed through the ``Rostlab/ProstT5``
    encoder.  The hidden states from index 1 up to (but excluding) the last
    attended token are averaged and written to ``<path>/<accession>.npy``.

    NOTE(review): index 0 is skipped, which presumably corresponds to a
    ProstT5 prefix token already present in each input string, and the
    sequences are handed to the tokenizer unchanged; confirm that callers
    pre-format them the way ProstT5 expects.

    Args:
        seq_list: List of protein sequences to embed.
        acc_list: List of identifiers corresponding to the sequences.
        path: Directory to save the embeddings (created if missing).
        device: Torch device string; half precision is used unless it is a CPU device.
    """

    assert len(seq_list) == len(acc_list), "Sequence and ID lists must match"

    os.makedirs(path, exist_ok=True)

    device = torch.device(device)

    tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5")
    model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
    # torch modules have no ``full()`` method; ``float()`` is the
    # full-precision counterpart of ``half()``.
    model = model.float() if device.type == 'cpu' else model.half()
    model.eval()

    for i, (seq, acc_id) in enumerate(tqdm(zip(seq_list, acc_list), total=len(seq_list), desc="Processing Sequences")):
        try:
            # Tokenize
            ids = tokenizer(
                seq,
                add_special_tokens=True,
                return_tensors='pt'
            ).to(device)

            # Forward pass
            with torch.no_grad():
                embedding_repr = model(
                    ids.input_ids,
                    attention_mask=ids.attention_mask
                )

            # Number of attended tokens minus the final one.
            real_len = ids.attention_mask[0].sum().item() - 1

            if real_len <= 1:
                # The slice [1:real_len] would be empty and its mean NaN.
                print(f"Sequence too short after tokenization for {acc_id}")
            else:
                emb = embedding_repr.last_hidden_state[0, 1:real_len]
                emb_avg = emb.mean(dim=0).cpu().numpy()

                np.save(os.path.join(path, f"{acc_id}.npy"), emb_avg)

                del emb, emb_avg

            del ids, embedding_repr

        except RuntimeError as e:
            print(f"RuntimeError while processing {acc_id}: {e}")

        # Periodic housekeeping, once every 100 sequences (also reached for
        # sequences that were skipped above).
        if i % 100 == 0:
            gc.collect()
            torch.cuda.empty_cache()
607
+