jpuglia committed on
Commit
2941a1c
·
1 Parent(s): c2f42e2

Created CLI prototype, rearrangement of files

Browse files
.gitattributes CHANGED
@@ -1,45 +1,2 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
36
- Plots/Embeddings/PCA_ESM300m.png filter=lfs diff=lfs merge=lfs -text
37
- Plots/Embeddings/PCA_ESM600m.png filter=lfs diff=lfs merge=lfs -text
38
- Plots/Embeddings/PCA_ProstT5.png filter=lfs diff=lfs merge=lfs -text
39
- Plots/Embeddings/UMAP_ESM300m.png filter=lfs diff=lfs merge=lfs -text
40
- Plots/Embeddings/UMAP_ESM600m.png filter=lfs diff=lfs merge=lfs -text
41
- Plots/Embeddings/UMAP_ProstT5.png filter=lfs diff=lfs merge=lfs -text
42
- Plots/Embeddings/t-SNE_ESM300m.png filter=lfs diff=lfs merge=lfs -text
43
- Plots/Embeddings/t-SNE_ESM600m.png filter=lfs diff=lfs merge=lfs -text
44
- Plots/Embeddings/t-SNE_ProstT5.png filter=lfs diff=lfs merge=lfs -text
45
- notebooks/EmbAnalisis.ipynb filter=lfs diff=lfs merge=lfs -text
 
1
+ Models/*.joblib filter=lfs diff=lfs merge=lfs -text
2
+ notebooks/*.ipynb filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
.gitignore CHANGED
@@ -1 +1,5 @@
1
- *.npy
 
 
 
 
 
1
+ *.npy
2
+ *.tab
3
+ __pycache__/
4
+ *.pyc
5
+ /home/juan/ProteinLocationPredictor/notebooks/__pycache__
Data/idmapping_2025_06_24_predictions.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ Sequence_ID,Predictions
2
+ sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.5908), CytoplasmicMembrane (0.2121), Periplasmic (0.1080), Extracellular (0.0750), OuterMembrane (0.0140), Cellwall (0.0000)
3
+ sp|P0A910|OMPA_ECOLI,OuterMembrane (0.9844), CytoplasmicMembrane (0.0069), Extracellular (0.0037), Cytoplasmic (0.0028), Periplasmic (0.0021), Cellwall (0.0000)
4
+ sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.7449), CytoplasmicMembrane (0.1760), Periplasmic (0.0376), Extracellular (0.0267), OuterMembrane (0.0145), Cellwall (0.0003)
5
+ sp|P02930|TOLC_ECOLI,OuterMembrane (0.9672), CytoplasmicMembrane (0.0185), Extracellular (0.0059), Periplasmic (0.0048), Cytoplasmic (0.0036), Cellwall (0.0000)
6
+ tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.7330), Cytoplasmic (0.0996), Periplasmic (0.0820), Extracellular (0.0585), OuterMembrane (0.0260), Cellwall (0.0009)
{ProteinLocationPredictor/Models → Models}/rfESM300.joblib RENAMED
File without changes
{ProteinLocationPredictor/Models → Models}/rfESM600.joblib RENAMED
File without changes
{ProteinLocationPredictor/Models → Models}/rfProst.joblib RENAMED
File without changes
{ProteinLocationPredictor/Models → Models}/svm300.joblib RENAMED
File without changes
{ProteinLocationPredictor/Models → Models}/svmESM600.joblib RENAMED
File without changes
{ProteinLocationPredictor/Models → Models}/svmProst.joblib RENAMED
File without changes
ProteinLocationPredictor/.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ProteinLocationPredictor/README.md DELETED
@@ -1,3 +0,0 @@
1
- ---
2
- license: mit
3
- ---
 
 
 
 
notebooks/EDA_Psort.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
notebooks/ESMC_300m.ipynb CHANGED
@@ -1,421 +1,3 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "c409c4ad",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "from esm.models.esmc import ESMC\n",
11
- "from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
12
- "from esm.sdk.forge import ESM3ForgeInferenceClient\n",
13
- "import pandas as pd\n",
14
- "import os\n",
15
- "from concurrent.futures import ProcessPoolExecutor, as_completed\n",
16
- "from tqdm import tqdm\n",
17
- "import numpy as np\n",
18
- "import os\n",
19
- "import torch\n",
20
- "import gc"
21
- ]
22
- },
23
- {
24
- "cell_type": "code",
25
- "execution_count": 4,
26
- "id": "7f8f916c",
27
- "metadata": {},
28
- "outputs": [
29
- {
30
- "data": {
31
- "text/html": [
32
- "<div>\n",
33
- "<style scoped>\n",
34
- " .dataframe tbody tr th:only-of-type {\n",
35
- " vertical-align: middle;\n",
36
- " }\n",
37
- "\n",
38
- " .dataframe tbody tr th {\n",
39
- " vertical-align: top;\n",
40
- " }\n",
41
- "\n",
42
- " .dataframe thead th {\n",
43
- " text-align: right;\n",
44
- " }\n",
45
- "</style>\n",
46
- "<table border=\"1\" class=\"dataframe\">\n",
47
- " <thead>\n",
48
- " <tr style=\"text-align: right;\">\n",
49
- " <th></th>\n",
50
- " <th>SwissProt_ID</th>\n",
51
- " <th>Refseq_Accession</th>\n",
52
- " <th>Other_Accession</th>\n",
53
- " <th>GramStain</th>\n",
54
- " <th>Experimental_Localization</th>\n",
55
- " <th>Phylum</th>\n",
56
- " <th>Class</th>\n",
57
- " <th>Organism</th>\n",
58
- " <th>sequence</th>\n",
59
- " </tr>\n",
60
- " </thead>\n",
61
- " <tbody>\n",
62
- " <tr>\n",
63
- " <th>0</th>\n",
64
- " <td>P50307</td>\n",
65
- " <td>NaN</td>\n",
66
- " <td>NaN</td>\n",
67
- " <td>Gram positive</td>\n",
68
- " <td>Cytoplasmic</td>\n",
69
- " <td>Firmicutes</td>\n",
70
- " <td>Bacilli</td>\n",
71
- " <td>Staphylococcus aureus</td>\n",
72
- " <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
73
- " </tr>\n",
74
- " <tr>\n",
75
- " <th>1</th>\n",
76
- " <td>P01552</td>\n",
77
- " <td>NaN</td>\n",
78
- " <td>NaN</td>\n",
79
- " <td>Gram positive</td>\n",
80
- " <td>Extracellular</td>\n",
81
- " <td>Firmicutes</td>\n",
82
- " <td>Bacilli</td>\n",
83
- " <td>Staphylococcus aureus</td>\n",
84
- " <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
85
- " </tr>\n",
86
- " <tr>\n",
87
- " <th>2</th>\n",
88
- " <td>P09978</td>\n",
89
- " <td>NaN</td>\n",
90
- " <td>NaN</td>\n",
91
- " <td>Gram positive</td>\n",
92
- " <td>Extracellular</td>\n",
93
- " <td>Firmicutes</td>\n",
94
- " <td>Bacilli</td>\n",
95
- " <td>Staphylococcus aureus</td>\n",
96
- " <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
97
- " </tr>\n",
98
- " <tr>\n",
99
- " <th>3</th>\n",
100
- " <td>P45723</td>\n",
101
- " <td>NaN</td>\n",
102
- " <td>NaN</td>\n",
103
- " <td>Gram positive</td>\n",
104
- " <td>Extracellular</td>\n",
105
- " <td>Firmicutes</td>\n",
106
- " <td>Bacilli</td>\n",
107
- " <td>Staphylococcus aureus</td>\n",
108
- " <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
109
- " </tr>\n",
110
- " <tr>\n",
111
- " <th>4</th>\n",
112
- " <td>P81177</td>\n",
113
- " <td>NaN</td>\n",
114
- " <td>NaN</td>\n",
115
- " <td>Gram positive</td>\n",
116
- " <td>Extracellular</td>\n",
117
- " <td>Firmicutes</td>\n",
118
- " <td>Bacilli</td>\n",
119
- " <td>Staphylococcus aureus</td>\n",
120
- " <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
121
- " </tr>\n",
122
- " </tbody>\n",
123
- "</table>\n",
124
- "</div>"
125
- ],
126
- "text/plain": [
127
- " SwissProt_ID Refseq_Accession Other_Accession GramStain \\\n",
128
- "0 P50307 NaN NaN Gram positive \n",
129
- "1 P01552 NaN NaN Gram positive \n",
130
- "2 P09978 NaN NaN Gram positive \n",
131
- "3 P45723 NaN NaN Gram positive \n",
132
- "4 P81177 NaN NaN Gram positive \n",
133
- "\n",
134
- " Experimental_Localization Phylum Class Organism \\\n",
135
- "0 Cytoplasmic Firmicutes Bacilli Staphylococcus aureus \n",
136
- "1 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
137
- "2 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
138
- "3 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
139
- "4 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
140
- "\n",
141
- " sequence \n",
142
- "0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
143
- "1 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
144
- "2 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
145
- "3 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
146
- "4 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
147
- ]
148
- },
149
- "execution_count": 4,
150
- "metadata": {},
151
- "output_type": "execute_result"
152
- }
153
- ],
154
- "source": [
155
- "sequences: pd.DataFrame = pd.read_csv('../Data/trainingData.csv')\n",
156
- "sequences.head()"
157
- ]
158
- },
159
- {
160
- "cell_type": "code",
161
- "execution_count": null,
162
- "id": "07a49fd0",
163
- "metadata": {},
164
- "outputs": [],
165
- "source": [
166
- "client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_300m\").to(\"cuda\")"
167
- ]
168
- },
169
- {
170
- "cell_type": "code",
171
- "execution_count": null,
172
- "id": "e562c770",
173
- "metadata": {},
174
- "outputs": [],
175
- "source": [
176
- "# Set up output directories and metadata file.\n",
177
- "embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm300m/embeddings\")\n",
178
- "os.makedirs(embeddings_dir, exist_ok=True)"
179
- ]
180
- },
181
- {
182
- "cell_type": "code",
183
- "execution_count": null,
184
- "id": "294c6798",
185
- "metadata": {},
186
- "outputs": [],
187
- "source": [
188
- "# --- Your provided function ---\n",
189
- "def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:\n",
190
- " protein = ESMProtein(sequence=sequence)\n",
191
- " protein_tensor = client.encode(protein)\n",
192
- " if isinstance(protein_tensor, ESMProteinError):\n",
193
- " raise protein_tensor\n",
194
- " output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))\n",
195
- " return output\n",
196
- "\n",
197
- "\n",
198
- "def save_emb(dir: str, df: pd.DataFrame, client: ESM3ForgeInferenceClient) -> None:\n",
199
- " dir = os.path.expanduser(dir)\n",
200
- " os.makedirs(dir, exist_ok=True)\n",
201
- "\n",
202
- " for i in tqdm(df.index, desc=\"Embedding sequences\"):\n",
203
- " try:\n",
204
- " output: LogitsOutput = embed_sequence(client=client, sequence=df.loc[i, 'sequence'])\n",
205
- " embeddings_np: np.ndarray = output.embeddings.cpu().numpy()\n",
206
- "\n",
207
- " if not pd.isna(df.loc[i, 'SwissProt_ID']):\n",
208
- " identifier = df.loc[i, 'SwissProt_ID']\n",
209
- " elif not pd.isna(df.loc[i, 'Refseq_Accession']):\n",
210
- " identifier = df.loc[i, 'Refseq_Accession']\n",
211
- " elif not pd.isna(df.loc[i, 'Other_Accession']):\n",
212
- " identifier = df.loc[i, 'Other_Accession']\n",
213
- " else:\n",
214
- " identifier = f\"unknown_{i}\"\n",
215
- "\n",
216
- " file_path: str = os.path.join(dir, f\"{identifier}.npy\")\n",
217
- " np.save(file_path, embeddings_np)\n",
218
- "\n",
219
- " del output\n",
220
- " gc.collect()\n",
221
- " torch.cuda.empty_cache()\n",
222
- "\n",
223
- " except Exception as e:\n",
224
- " print(f\"Error embedding index {i}: {e}\")"
225
- ]
226
- },
227
- {
228
- "cell_type": "code",
229
- "execution_count": null,
230
- "id": "80db4990",
231
- "metadata": {},
232
- "outputs": [],
233
- "source": [
234
- "\n",
235
- " \n",
236
- "# Pass metadata_writer (and client if needed) to your function\n",
237
- "save_emb(embeddings_dir, sequences,client = client)\n"
238
- ]
239
- },
240
- {
241
- "cell_type": "code",
242
- "execution_count": null,
243
- "id": "77bf92c6",
244
- "metadata": {},
245
- "outputs": [],
246
- "source": [
247
- "sequences.loc[[11392]]"
248
- ]
249
- },
250
- {
251
- "cell_type": "code",
252
- "execution_count": 9,
253
- "id": "365d9fdb",
254
- "metadata": {},
255
- "outputs": [],
256
- "source": [
257
- "sequences = sequences.drop(index=11392)"
258
- ]
259
- },
260
- {
261
- "cell_type": "code",
262
- "execution_count": null,
263
- "id": "ad8a1990",
264
- "metadata": {},
265
- "outputs": [],
266
- "source": [
267
- "# Set up output directories and metadata file.\n",
268
- "embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
269
- "os.makedirs(embeddings_dir, exist_ok=True)\n",
270
- "client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
271
- ]
272
- },
273
- {
274
- "cell_type": "code",
275
- "execution_count": null,
276
- "id": "d42e5263",
277
- "metadata": {},
278
- "outputs": [],
279
- "source": [
280
- "save_emb(embeddings_dir, sequences,client = client)"
281
- ]
282
- },
283
- {
284
- "cell_type": "code",
285
- "execution_count": 2,
286
- "id": "df91fc10",
287
- "metadata": {},
288
- "outputs": [],
289
- "source": [
290
- "def load_single_embedding(row, id_col, path):\n",
291
- " try:\n",
292
- " emb = np.load(os.path.join(path, f\"{row[id_col]}.npy\"))\n",
293
- " emb = emb.squeeze(axis=0)\n",
294
- " emb = np.mean(emb, axis=0)\n",
295
- " return emb\n",
296
- " except Exception as e:\n",
297
- " print(f\"Error loading embedding {row[id_col]} due to {e}\")\n",
298
- " return None\n",
299
- "\n",
300
- "def load_emb_parallel(df: pd.DataFrame, id_col: str, path: str, max_workers=None) -> list:\n",
301
- " embeddings = []\n",
302
- " with ProcessPoolExecutor(max_workers=max_workers) as executor:\n",
303
- " futures = {\n",
304
- " executor.submit(load_single_embedding, df.loc[i], id_col, path): i for i in df.index\n",
305
- " }\n",
306
- "\n",
307
- " for future in tqdm(as_completed(futures), total=len(futures), desc=\"Loading embeddings\"):\n",
308
- " emb = future.result()\n",
309
- " if emb is not None:\n",
310
- " embeddings.append(emb)\n",
311
- " return embeddings\n",
312
- "\n"
313
- ]
314
- },
315
- {
316
- "cell_type": "code",
317
- "execution_count": 5,
318
- "id": "329701f6",
319
- "metadata": {},
320
- "outputs": [],
321
- "source": [
322
- "sequences['Preferred_ID'] = sequences['SwissProt_ID'].fillna(sequences['Refseq_Accession']).fillna(sequences['Other_Accession'])\n"
323
- ]
324
- },
325
- {
326
- "cell_type": "code",
327
- "execution_count": 6,
328
- "id": "9b720ff2",
329
- "metadata": {},
330
- "outputs": [
331
- {
332
- "name": "stderr",
333
- "output_type": "stream",
334
- "text": [
335
- "Loading embeddings: 97%|█████████▋| 11377/11691 [05:32<00:10, 31.20it/s]"
336
- ]
337
- },
338
- {
339
- "name": "stdout",
340
- "output_type": "stream",
341
- "text": [
342
- "Error loading embedding Q9I120 due to [Errno 2] No such file or directory: '/home/jpuglia/Documentos/Tesis/datosGenerados/esm600m/embeddings/Q9I120.npy'\n"
343
- ]
344
- },
345
- {
346
- "name": "stderr",
347
- "output_type": "stream",
348
- "text": [
349
- "Loading embeddings: 100%|██████████| 11691/11691 [05:40<00:00, 34.29it/s]\n"
350
- ]
351
- }
352
- ],
353
- "source": [
354
- "embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
355
- "embeddings = load_emb_parallel(sequences, 'Preferred_ID',embeddings_dir)"
356
- ]
357
- },
358
- {
359
- "cell_type": "code",
360
- "execution_count": 15,
361
- "id": "765209e3",
362
- "metadata": {},
363
- "outputs": [
364
- {
365
- "name": "stdout",
366
- "output_type": "stream",
367
- "text": [
368
- "Embeddings count: 11690\n",
369
- "Sequences count: 11690\n"
370
- ]
371
- }
372
- ],
373
- "source": [
374
- "print(f\"Embeddings count: {len(embeddings)}\")\n",
375
- "print(f\"Sequences count: {len(sequences)}\")\n"
376
- ]
377
- },
378
- {
379
- "cell_type": "code",
380
- "execution_count": 17,
381
- "id": "63bf7f6c",
382
- "metadata": {},
383
- "outputs": [
384
- {
385
- "data": {
386
- "text/plain": [
387
- "(1152,)"
388
- ]
389
- },
390
- "execution_count": 17,
391
- "metadata": {},
392
- "output_type": "execute_result"
393
- }
394
- ],
395
- "source": [
396
- "embeddings[0].shape"
397
- ]
398
- }
399
- ],
400
- "metadata": {
401
- "kernelspec": {
402
- "display_name": "tesisEnv",
403
- "language": "python",
404
- "name": "python3"
405
- },
406
- "language_info": {
407
- "codemirror_mode": {
408
- "name": "ipython",
409
- "version": 3
410
- },
411
- "file_extension": ".py",
412
- "mimetype": "text/x-python",
413
- "name": "python",
414
- "nbconvert_exporter": "python",
415
- "pygments_lexer": "ipython3",
416
- "version": "3.10.16"
417
- }
418
- },
419
- "nbformat": 4,
420
- "nbformat_minor": 5
421
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7bcc1ec16a5d8992cfdb6ca4d61d8c69cad64b683a697f6622e0c1f0d921076
3
+ size 13125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/ESMC_600m.ipynb CHANGED
@@ -1,256 +1,3 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 2,
6
- "id": "c409c4ad",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "from esm.models.esmc import ESMC\n",
11
- "from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
12
- "from esm.sdk.forge import ESM3ForgeInferenceClient\n",
13
- "from esm.sdk import batch_executor\n",
14
- "import pandas as pd\n",
15
- "import os\n",
16
- "import csv\n",
17
- "import numpy as np\n",
18
- "import torch"
19
- ]
20
- },
21
- {
22
- "cell_type": "code",
23
- "execution_count": 3,
24
- "id": "7f8f916c",
25
- "metadata": {},
26
- "outputs": [
27
- {
28
- "data": {
29
- "text/html": [
30
- "<div>\n",
31
- "<style scoped>\n",
32
- " .dataframe tbody tr th:only-of-type {\n",
33
- " vertical-align: middle;\n",
34
- " }\n",
35
- "\n",
36
- " .dataframe tbody tr th {\n",
37
- " vertical-align: top;\n",
38
- " }\n",
39
- "\n",
40
- " .dataframe thead th {\n",
41
- " text-align: right;\n",
42
- " }\n",
43
- "</style>\n",
44
- "<table border=\"1\" class=\"dataframe\">\n",
45
- " <thead>\n",
46
- " <tr style=\"text-align: right;\">\n",
47
- " <th></th>\n",
48
- " <th>SwissProt_ID</th>\n",
49
- " <th>Experimental_Localization</th>\n",
50
- " <th>Organism</th>\n",
51
- " <th>Phylum</th>\n",
52
- " <th>Class</th>\n",
53
- " <th>GramStain</th>\n",
54
- " <th>Sequence</th>\n",
55
- " </tr>\n",
56
- " </thead>\n",
57
- " <tbody>\n",
58
- " <tr>\n",
59
- " <th>0</th>\n",
60
- " <td>P50307</td>\n",
61
- " <td>Cytoplasmic</td>\n",
62
- " <td>Staphylococcus aureus</td>\n",
63
- " <td>Firmicutes</td>\n",
64
- " <td>Bacilli</td>\n",
65
- " <td>1.0</td>\n",
66
- " <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
67
- " </tr>\n",
68
- " <tr>\n",
69
- " <th>1</th>\n",
70
- " <td>P01552</td>\n",
71
- " <td>Extracellular</td>\n",
72
- " <td>Staphylococcus aureus</td>\n",
73
- " <td>Firmicutes</td>\n",
74
- " <td>Bacilli</td>\n",
75
- " <td>1.0</td>\n",
76
- " <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
77
- " </tr>\n",
78
- " <tr>\n",
79
- " <th>2</th>\n",
80
- " <td>P09978</td>\n",
81
- " <td>Extracellular</td>\n",
82
- " <td>Staphylococcus aureus</td>\n",
83
- " <td>Firmicutes</td>\n",
84
- " <td>Bacilli</td>\n",
85
- " <td>1.0</td>\n",
86
- " <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
87
- " </tr>\n",
88
- " <tr>\n",
89
- " <th>3</th>\n",
90
- " <td>P45723</td>\n",
91
- " <td>Extracellular</td>\n",
92
- " <td>Staphylococcus aureus</td>\n",
93
- " <td>Firmicutes</td>\n",
94
- " <td>Bacilli</td>\n",
95
- " <td>1.0</td>\n",
96
- " <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
97
- " </tr>\n",
98
- " <tr>\n",
99
- " <th>4</th>\n",
100
- " <td>P81177</td>\n",
101
- " <td>Extracellular</td>\n",
102
- " <td>Staphylococcus aureus</td>\n",
103
- " <td>Firmicutes</td>\n",
104
- " <td>Bacilli</td>\n",
105
- " <td>1.0</td>\n",
106
- " <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
107
- " </tr>\n",
108
- " </tbody>\n",
109
- "</table>\n",
110
- "</div>"
111
- ],
112
- "text/plain": [
113
- " SwissProt_ID Experimental_Localization Organism Phylum \\\n",
114
- "0 P50307 Cytoplasmic Staphylococcus aureus Firmicutes \n",
115
- "1 P01552 Extracellular Staphylococcus aureus Firmicutes \n",
116
- "2 P09978 Extracellular Staphylococcus aureus Firmicutes \n",
117
- "3 P45723 Extracellular Staphylococcus aureus Firmicutes \n",
118
- "4 P81177 Extracellular Staphylococcus aureus Firmicutes \n",
119
- "\n",
120
- " Class GramStain Sequence \n",
121
- "0 Bacilli 1.0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
122
- "1 Bacilli 1.0 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
123
- "2 Bacilli 1.0 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
124
- "3 Bacilli 1.0 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
125
- "4 Bacilli 1.0 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
126
- ]
127
- },
128
- "execution_count": 3,
129
- "metadata": {},
130
- "output_type": "execute_result"
131
- }
132
- ],
133
- "source": [
134
- "sequences: pd.DataFrame = pd.read_csv('/home/jpuglia/Documentos/Tesis/tesisESM/Data/trainingData.csv')\n",
135
- "sequences.head()"
136
- ]
137
- },
138
- {
139
- "cell_type": "code",
140
- "execution_count": null,
141
- "id": "d7026979",
142
- "metadata": {},
143
- "outputs": [
144
- {
145
- "ename": "ValueError",
146
- "evalue": "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",
147
- "output_type": "error",
148
- "traceback": [
149
- "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
150
- "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
151
- "\u001b[0;32m/tmp/ipykernel_118460/767462261.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0misfloat\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Sequence'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0msequences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misfloat\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
152
- "\u001b[0;32m~/miniconda3/envs/tesisEnv/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1575\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mfinal\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1576\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__nonzero__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mNoReturn\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1577\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1578\u001b[0m \u001b[0;34mf\"The truth value of a {type(self).__name__} is ambiguous. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1579\u001b[0m \u001b[0;34m\"Use a.empty, a.bool(), a.item(), a.any() or a.all().\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1580\u001b[0m )\n",
153
- "\u001b[0;31mValueError\u001b[0m: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
154
- ]
155
- }
156
- ],
157
- "source": [
158
- "isfloat: bool = sequences['Sequence'].apply(lambda x:isinstance(x,float))\n",
159
- "\n",
160
- "sequences = sequences[~isfloat]"
161
- ]
162
- },
163
- {
164
- "cell_type": "code",
165
- "execution_count": null,
166
- "id": "ea723ad9",
167
- "metadata": {},
168
- "outputs": [],
169
- "source": [
170
- "sequences = sequences.dropna()\n",
171
- "sequences = sequences.drop_duplicates()\n",
172
- "sequences.shape"
173
- ]
174
- },
175
- {
176
- "cell_type": "code",
177
- "execution_count": null,
178
- "id": "07a49fd0",
179
- "metadata": {},
180
- "outputs": [],
181
- "source": [
182
- "torch.cuda.empty_cache()\n",
183
- "client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
184
- ]
185
- },
186
- {
187
- "cell_type": "code",
188
- "execution_count": null,
189
- "id": "294c6798",
190
- "metadata": {},
191
- "outputs": [],
192
- "source": [
193
- "# Set up output directories and metadata file.\n",
194
- "embeddings_dir = \"/home/jpuglia/Documentos/Tesis/datosGenerados/esm600m/embeddings\"\n",
195
- "os.makedirs(embeddings_dir, exist_ok=True)\n",
196
- "\n",
197
- "def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:\n",
198
- " \n",
199
- " protein = ESMProtein(sequence=sequence)\n",
200
- " protein_tensor = client.encode(protein)\n",
201
- " if isinstance(protein_tensor, ESMProteinError):\n",
202
- " raise protein_tensor\n",
203
- " output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))\n",
204
- " return output\n",
205
- "\n",
206
- "\n",
207
- "def save_emb(dir: str, df : pd.DataFrame) -> None:\n",
208
- " \n",
209
- " for i in df.index:\n",
210
- " \n",
211
- " output: LogitsOutput = embed_sequence(client = client, sequence = df.loc[i, 'Sequence'])\n",
212
- " \n",
213
- " embeddings_np : np.ndarray = output.embeddings.cpu().numpy()\n",
214
- " \n",
215
- " file_path : str = os.path.join(dir,f\"{df.loc[i, 'SwissProt_ID']}.npy\") \n",
216
- "\n",
217
- " np.save(file_path, embeddings_np)\n",
218
- " \n",
219
- " del output\n",
220
- " \n",
221
- " torch.cuda.empty_cache()"
222
- ]
223
- },
224
- {
225
- "cell_type": "code",
226
- "execution_count": null,
227
- "id": "80db4990",
228
- "metadata": {},
229
- "outputs": [],
230
- "source": [
231
- "save_emb(embeddings_dir, sequences)\n"
232
- ]
233
- }
234
- ],
235
- "metadata": {
236
- "kernelspec": {
237
- "display_name": "tesisEnv",
238
- "language": "python",
239
- "name": "python3"
240
- },
241
- "language_info": {
242
- "codemirror_mode": {
243
- "name": "ipython",
244
- "version": 3
245
- },
246
- "file_extension": ".py",
247
- "mimetype": "text/x-python",
248
- "name": "python",
249
- "nbconvert_exporter": "python",
250
- "pygments_lexer": "ipython3",
251
- "version": "3.10.16"
252
- }
253
- },
254
- "nbformat": 4,
255
- "nbformat_minor": 5
256
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06608effffc76bbb3ca65c67a84f06762d459b56a823bff9e61695cab83bb10c
3
+ size 10350
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/EmbAnalisis.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:1e21abaa9bc06181ad40648ad354596985d284daada49adc7d9c0d17daa6bce5
3
- size 10632399
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f56a416d1a8fb454ba368583013118d8fc490964dd036d3b3ce8c5879a4393b3
3
+ size 10635423
notebooks/ProstT5.ipynb CHANGED
@@ -1,526 +1,3 @@
1
- {
2
- "cells": [
3
- {
4
- "cell_type": "code",
5
- "execution_count": 1,
6
- "id": "40b1e04a",
7
- "metadata": {},
8
- "outputs": [],
9
- "source": [
10
- "import pandas as pd\n",
11
- "from transformers import T5Tokenizer, T5EncoderModel\n",
12
- "import torch\n",
13
- "import re\n",
14
- "from tqdm.notebook import tqdm\n",
15
- "import os\n",
16
- "import numpy as np\n",
17
- "import gc\n",
18
- "\n",
19
- "os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n"
20
- ]
21
- },
22
- {
23
- "cell_type": "code",
24
- "execution_count": 2,
25
- "id": "f4c8ff50",
26
- "metadata": {},
27
- "outputs": [
28
- {
29
- "data": {
30
- "text/html": [
31
- "<div>\n",
32
- "<style scoped>\n",
33
- " .dataframe tbody tr th:only-of-type {\n",
34
- " vertical-align: middle;\n",
35
- " }\n",
36
- "\n",
37
- " .dataframe tbody tr th {\n",
38
- " vertical-align: top;\n",
39
- " }\n",
40
- "\n",
41
- " .dataframe thead th {\n",
42
- " text-align: right;\n",
43
- " }\n",
44
- "</style>\n",
45
- "<table border=\"1\" class=\"dataframe\">\n",
46
- " <thead>\n",
47
- " <tr style=\"text-align: right;\">\n",
48
- " <th></th>\n",
49
- " <th>GramStain</th>\n",
50
- " <th>Experimental_Localization</th>\n",
51
- " <th>Phylum</th>\n",
52
- " <th>Class</th>\n",
53
- " <th>Organism</th>\n",
54
- " <th>sequence</th>\n",
55
- " <th>id</th>\n",
56
- " </tr>\n",
57
- " </thead>\n",
58
- " <tbody>\n",
59
- " <tr>\n",
60
- " <th>0</th>\n",
61
- " <td>Gram positive</td>\n",
62
- " <td>Cytoplasmic</td>\n",
63
- " <td>Firmicutes</td>\n",
64
- " <td>Bacilli</td>\n",
65
- " <td>Staphylococcus aureus</td>\n",
66
- " <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
67
- " <td>P50307</td>\n",
68
- " </tr>\n",
69
- " <tr>\n",
70
- " <th>1</th>\n",
71
- " <td>Gram positive</td>\n",
72
- " <td>Extracellular</td>\n",
73
- " <td>Firmicutes</td>\n",
74
- " <td>Bacilli</td>\n",
75
- " <td>Staphylococcus aureus</td>\n",
76
- " <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
77
- " <td>P01552</td>\n",
78
- " </tr>\n",
79
- " <tr>\n",
80
- " <th>2</th>\n",
81
- " <td>Gram positive</td>\n",
82
- " <td>Extracellular</td>\n",
83
- " <td>Firmicutes</td>\n",
84
- " <td>Bacilli</td>\n",
85
- " <td>Staphylococcus aureus</td>\n",
86
- " <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
87
- " <td>P09978</td>\n",
88
- " </tr>\n",
89
- " <tr>\n",
90
- " <th>3</th>\n",
91
- " <td>Gram positive</td>\n",
92
- " <td>Extracellular</td>\n",
93
- " <td>Firmicutes</td>\n",
94
- " <td>Bacilli</td>\n",
95
- " <td>Staphylococcus aureus</td>\n",
96
- " <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
97
- " <td>P45723</td>\n",
98
- " </tr>\n",
99
- " <tr>\n",
100
- " <th>4</th>\n",
101
- " <td>Gram positive</td>\n",
102
- " <td>Extracellular</td>\n",
103
- " <td>Firmicutes</td>\n",
104
- " <td>Bacilli</td>\n",
105
- " <td>Staphylococcus aureus</td>\n",
106
- " <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
107
- " <td>P81177</td>\n",
108
- " </tr>\n",
109
- " </tbody>\n",
110
- "</table>\n",
111
- "</div>"
112
- ],
113
- "text/plain": [
114
- " GramStain Experimental_Localization Phylum Class \\\n",
115
- "0 Gram positive Cytoplasmic Firmicutes Bacilli \n",
116
- "1 Gram positive Extracellular Firmicutes Bacilli \n",
117
- "2 Gram positive Extracellular Firmicutes Bacilli \n",
118
- "3 Gram positive Extracellular Firmicutes Bacilli \n",
119
- "4 Gram positive Extracellular Firmicutes Bacilli \n",
120
- "\n",
121
- " Organism sequence \\\n",
122
- "0 Staphylococcus aureus MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
123
- "1 Staphylococcus aureus MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
124
- "2 Staphylococcus aureus MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
125
- "3 Staphylococcus aureus MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
126
- "4 Staphylococcus aureus MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... \n",
127
- "\n",
128
- " id \n",
129
- "0 P50307 \n",
130
- "1 P01552 \n",
131
- "2 P09978 \n",
132
- "3 P45723 \n",
133
- "4 P81177 "
134
- ]
135
- },
136
- "execution_count": 2,
137
- "metadata": {},
138
- "output_type": "execute_result"
139
- }
140
- ],
141
- "source": [
142
- "sequences_df = pd.read_csv('../Data/trainingData.csv')\n",
143
- "sequences_df['id'] = sequences_df['SwissProt_ID'].fillna(sequences_df['Refseq_Accession'].fillna(sequences_df['Other_Accession']))\n",
144
- "sequences_df = sequences_df.drop(columns=['SwissProt_ID', 'Refseq_Accession', 'Other_Accession'])\n",
145
- "sequences_df.head()"
146
- ]
147
- },
148
- {
149
- "cell_type": "code",
150
- "execution_count": 3,
151
- "id": "6925775b",
152
- "metadata": {},
153
- "outputs": [
154
- {
155
- "name": "stdout",
156
- "output_type": "stream",
157
- "text": [
158
- "Secuencias 11691\n",
159
- "Ids 11691\n"
160
- ]
161
- }
162
- ],
163
- "source": [
164
- "sequences = list(sequences_df['sequence'].values)\n",
165
- "accession = list(sequences_df['id'].values)\n",
166
- "\n",
167
- "print(f\"Secuencias {len(sequences)}\\nIds {len(accession)}\")"
168
- ]
169
- },
170
- {
171
- "cell_type": "code",
172
- "execution_count": 4,
173
- "id": "c19ac1ba",
174
- "metadata": {},
175
- "outputs": [],
176
- "source": [
177
- "path = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/prost/embeddings\")"
178
- ]
179
- },
180
- {
181
- "cell_type": "code",
182
- "execution_count": 5,
183
- "id": "5b5e321e",
184
- "metadata": {},
185
- "outputs": [
186
- {
187
- "name": "stderr",
188
- "output_type": "stream",
189
- "text": [
190
- "You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
191
- ]
192
- },
193
- {
194
- "data": {
195
- "application/vnd.jupyter.widget-view+json": {
196
- "model_id": "17d989ac426c445dbfd209d0247a9a3d",
197
- "version_major": 2,
198
- "version_minor": 0
199
- },
200
- "text/plain": [
201
- "Processing Sequences: 0%| | 0/11691 [00:00<?, ?it/s]"
202
- ]
203
- },
204
- "metadata": {},
205
- "output_type": "display_data"
206
- },
207
- {
208
- "name": "stdout",
209
- "output_type": "stream",
210
- "text": [
211
- "Error CUDA out of memory. Tried to allocate 1.64 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.02 GiB is free. Including non-PyTorch memory, this process has 4.11 GiB memory in use. Of the allocated memory 4.00 GiB is allocated by PyTorch, and 10.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba CAC14227\n",
212
- "Error CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.09 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.89 GiB is allocated by PyTorch, and 36.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P12255\n",
213
- "Error CUDA out of memory. Tried to allocate 982.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 748.44 MiB is free. Including non-PyTorch memory, this process has 4.40 GiB memory in use. Of the allocated memory 4.25 GiB is allocated by PyTorch, and 51.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P20471\n",
214
- "Error CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 711.88 MiB is free. Including non-PyTorch memory, this process has 4.48 GiB memory in use. Of the allocated memory 4.33 GiB is allocated by PyTorch, and 44.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba A64556\n",
215
- "Error CUDA out of memory. Tried to allocate 1.28 GiB. GPU 0 has a total capacity of 5.59 GiB of which 111.88 MiB is free. Including non-PyTorch memory, this process has 5.07 GiB memory in use. Of the allocated memory 4.90 GiB is allocated by PyTorch, and 67.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba AAF25576\n",
216
- "Error CUDA out of memory. Tried to allocate 1.55 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.91 GiB is allocated by PyTorch, and 19.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q4L9P0\n",
217
- "Error CUDA out of memory. Tried to allocate 1.04 GiB. GPU 0 has a total capacity of 5.59 GiB of which 591.88 MiB is free. Including non-PyTorch memory, this process has 4.60 GiB memory in use. Of the allocated memory 4.45 GiB is allocated by PyTorch, and 40.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I5N6\n",
218
- "Error CUDA out of memory. Tried to allocate 1.49 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.22 GiB is free. Including non-PyTorch memory, this process has 3.95 GiB memory in use. Of the allocated memory 3.84 GiB is allocated by PyTorch, and 5.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I791\n",
219
- "Error CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 31.88 MiB is free. Including non-PyTorch memory, this process has 5.14 GiB memory in use. Of the allocated memory 5.01 GiB is allocated by PyTorch, and 36.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I120\n"
220
- ]
221
- }
222
- ],
223
- "source": [
224
- "# Setup device\n",
225
- "device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
226
- "\n",
227
- "# Load tokenizer and model\n",
228
- "tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
229
- "model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
230
- "model.full() if device == 'cpu' else model.half()\n",
231
- "\n",
232
- "# Clean sequences\n",
233
- "sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
234
- "sequences = [ \"<AA2fold> \" + s for s in sequences]\n",
235
- "\n",
236
- "# Process each sequence individually\n",
237
- "for i, (seq, acc_id) in enumerate(tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\")):\n",
238
- " try:\n",
239
- " # Tokenize\n",
240
- " ids = tokenizer(\n",
241
- " seq,\n",
242
- " add_special_tokens=True,\n",
243
- " return_tensors='pt'\n",
244
- " ).to(device)\n",
245
- "\n",
246
- " # Forward pass\n",
247
- " with torch.no_grad():\n",
248
- " embedding_repr = model(\n",
249
- " ids.input_ids,\n",
250
- " attention_mask=ids.attention_mask\n",
251
- " )\n",
252
- "\n",
253
- " # Compute actual length (excluding prefix)\n",
254
- " real_len = ids.attention_mask[0].sum().item() - 1\n",
255
- "\n",
256
- " # Extract and average embeddings\n",
257
- " emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
258
- " emb_avg = emb.mean(dim=0).cpu().numpy()\n",
259
- "\n",
260
- " # Save embedding using accession ID\n",
261
- " np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
262
- "\n",
263
- "\n",
264
- " # Cleanup\n",
265
- " del ids, embedding_repr, emb, emb_avg\n",
266
- " torch.cuda.empty_cache()\n",
267
- " gc.collect()\n",
268
- "\n",
269
- " except RuntimeError as e:\n",
270
- " print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
271
- "\n"
272
- ]
273
- },
274
- {
275
- "cell_type": "code",
276
- "execution_count": 6,
277
- "id": "9ca9cb2d",
278
- "metadata": {},
279
- "outputs": [
280
- {
281
- "data": {
282
- "text/html": [
283
- "<div>\n",
284
- "<style scoped>\n",
285
- " .dataframe tbody tr th:only-of-type {\n",
286
- " vertical-align: middle;\n",
287
- " }\n",
288
- "\n",
289
- " .dataframe tbody tr th {\n",
290
- " vertical-align: top;\n",
291
- " }\n",
292
- "\n",
293
- " .dataframe thead th {\n",
294
- " text-align: right;\n",
295
- " }\n",
296
- "</style>\n",
297
- "<table border=\"1\" class=\"dataframe\">\n",
298
- " <thead>\n",
299
- " <tr style=\"text-align: right;\">\n",
300
- " <th></th>\n",
301
- " <th>GramStain</th>\n",
302
- " <th>Experimental_Localization</th>\n",
303
- " <th>Phylum</th>\n",
304
- " <th>Class</th>\n",
305
- " <th>Organism</th>\n",
306
- " <th>sequence</th>\n",
307
- " <th>id</th>\n",
308
- " </tr>\n",
309
- " </thead>\n",
310
- " <tbody>\n",
311
- " <tr>\n",
312
- " <th>1532</th>\n",
313
- " <td>Gram negative</td>\n",
314
- " <td>OuterMembrane,Extracellular</td>\n",
315
- " <td>Proteobacteria</td>\n",
316
- " <td>Gammaproteobacteria</td>\n",
317
- " <td>Yersinia pestis</td>\n",
318
- " <td>MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ...</td>\n",
319
- " <td>CAC14227</td>\n",
320
- " </tr>\n",
321
- " <tr>\n",
322
- " <th>1683</th>\n",
323
- " <td>Gram negative</td>\n",
324
- " <td>OuterMembrane</td>\n",
325
- " <td>Proteobacteria</td>\n",
326
- " <td>Betaproteobacteria</td>\n",
327
- " <td>Bordetella pertussis</td>\n",
328
- " <td>MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS...</td>\n",
329
- " <td>P12255</td>\n",
330
- " </tr>\n",
331
- " <tr>\n",
332
- " <th>1767</th>\n",
333
- " <td>Gram negative</td>\n",
334
- " <td>CytoplasmicMembrane</td>\n",
335
- " <td>Proteobacteria</td>\n",
336
- " <td>Alphaproteobacteria</td>\n",
337
- " <td>Sinorhizobium meliloti</td>\n",
338
- " <td>MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG...</td>\n",
339
- " <td>P20471</td>\n",
340
- " </tr>\n",
341
- " <tr>\n",
342
- " <th>4089</th>\n",
343
- " <td>Gram negative</td>\n",
344
- " <td>OuterMembrane,Extracellular</td>\n",
345
- " <td>Proteobacteria</td>\n",
346
- " <td>Epsilonproteobacteria</td>\n",
347
- " <td>Helicobacter pylori</td>\n",
348
- " <td>MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW...</td>\n",
349
- " <td>A64556</td>\n",
350
- " </tr>\n",
351
- " <tr>\n",
352
- " <th>4623</th>\n",
353
- " <td>Gram positive</td>\n",
354
- " <td>Cellwall</td>\n",
355
- " <td>Firmicutes</td>\n",
356
- " <td>Bacilli</td>\n",
357
- " <td>Lactobacillus reuteri</td>\n",
358
- " <td>MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT...</td>\n",
359
- " <td>AAF25576</td>\n",
360
- " </tr>\n",
361
- " </tbody>\n",
362
- "</table>\n",
363
- "</div>"
364
- ],
365
- "text/plain": [
366
- " GramStain Experimental_Localization Phylum \\\n",
367
- "1532 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
368
- "1683 Gram negative OuterMembrane Proteobacteria \n",
369
- "1767 Gram negative CytoplasmicMembrane Proteobacteria \n",
370
- "4089 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
371
- "4623 Gram positive Cellwall Firmicutes \n",
372
- "\n",
373
- " Class Organism \\\n",
374
- "1532 Gammaproteobacteria Yersinia pestis \n",
375
- "1683 Betaproteobacteria Bordetella pertussis \n",
376
- "1767 Alphaproteobacteria Sinorhizobium meliloti \n",
377
- "4089 Epsilonproteobacteria Helicobacter pylori \n",
378
- "4623 Bacilli Lactobacillus reuteri \n",
379
- "\n",
380
- " sequence id \n",
381
- "1532 MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ... CAC14227 \n",
382
- "1683 MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS... P12255 \n",
383
- "1767 MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG... P20471 \n",
384
- "4089 MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW... A64556 \n",
385
- "4623 MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT... AAF25576 "
386
- ]
387
- },
388
- "execution_count": 6,
389
- "metadata": {},
390
- "output_type": "execute_result"
391
- }
392
- ],
393
- "source": [
394
- "cpu_ids = [\n",
395
- " 'CAC14227',\n",
396
- " 'P12255',\n",
397
- " 'P20471',\n",
398
- " 'A64556',\n",
399
- " 'AAF25576',\n",
400
- " 'Q4L9P0',\n",
401
- " 'Q9I5N6',\n",
402
- " 'Q9I791',\n",
403
- " 'Q9I120'\n",
404
- "]\n",
405
- "\n",
406
- "cpu_sequences = sequences_df[sequences_df['id'].isin(cpu_ids)]\n",
407
- "cpu_sequences.head()\n"
408
- ]
409
- },
410
- {
411
- "cell_type": "code",
412
- "execution_count": 7,
413
- "id": "a919beeb",
414
- "metadata": {},
415
- "outputs": [
416
- {
417
- "name": "stdout",
418
- "output_type": "stream",
419
- "text": [
420
- "['MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQSMVSKLFRKNLLALSLGSIVFLSTGPVFAADITVSTQAELSAALSNGTYDKIILGADITLIGSLTVNMTSNQVVIDGQGKFGLTVNNTTNYGLVVSSGSGTLTLQNMSKIDSANYYSMVVLNGANTAVNVIYNNIDFLGSSQLIYMGAYGAATNSIMTFGDILNDVVVNDRAQEIGEVNKLAFTGRFHVTHTGSSVTSFVSTGGANNTSTMDFASGADVKIDRTGSTGDLTSTGVNAFAYTFADGASFELIANQNVFSGTTTNRGLEIGSYNSIDGFGSGVKIVLQSRSDGSIISGNGIDNATTNAAGINNNASGDANVIYNLGTGSILKATNTGILATKNANNASDIYIRSAGDITAATGISATHNGTGTVKIKNDGTITSTTAGIAISSASIKEISVDNTDGTITATAGTGVNVLASAILNLFGGTINTSATANGITFAGTEGGHTLTDLTINLLGTGIALSNVAGVNLTLSNVTLNTLNGTALNSLTGLTLVDSLNGRNTINIEGAGIGIAATNTELNTFDAEALDINVNGAGIGIQATGGGVNLSASNLIINVANTLGTALQITDGIDNTTTIGNEIQLNAENATAINFLGSSSKTLNNNGTIKGSVIFAGVADHIINNNGTLDGTLTTGAGNDTLVLDSSSQSNDVINLGDGNNSVTIQNGATVSSIITGNGNDTFTINGMSVGSTYLGSLDAGTGLNTXNXXASTDELAAATSLQGFTNINLVDSHITLVSDDNIGSGMVNIDSSSELLFGSTFDGILHATLGAGTGSAIVNNSANVSLEQASMFAGTWQVNQGGALTASNSNQLGSAKIGLDGTLNLDNIALFNHVLTGNGTLNVAKNLATTAFDFGSTVGGAFSGIVNLTKTTFALSADNAAALASATLKLSDDSVTTVGTTDRTLHGLDLSGGTLIFDGAVPQSQTSGVVTVTDLALNSGTVNITGSGSWDNTDPLATNVSILEQDRAGSTLELINATNVTGDIDALDLLVNGTAITSGTQGVQSAIQQGGSTVANAIHNYGLASSNSNGDSGLYVNYTLSALELLADGADALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGSNRYEGTTTVTAGELILGANGAFGQTSLLDIASGASANINGYSQTVGAVTNVGTVTLGSGGVLTSGLLTNGGILDLTGGALNLTXGGASTVAGGLTGAGTLNINGGNLSVSAANSGLSGQTHIADVASVTLTDTGTLGTSAVEVLGTLNLNGANAAMTNVLSGDGTINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHVLTGNGLLNVAKNDASTAFDFGSTVGGAFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGATDRTLHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLELINAANVTGNANNLDLLVDGTAITSGTQGVESAIQQGGSTVANAIHNYGLTSSNGNGGSGLYVNYTLSALELLANGANALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNSGAVTLGNGGVLTSGLLTNGGILDLTGGALNLAAGGSSTVAGGLTGAGTLNINGGDLAVSATNSGLSGQTHIADVASVTLTGTGTLGTSAVEVLGTLNLNGANAAMTNVLSGGGVINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANN
SGFLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNXTFGNSVTGSGVLQVTDDAEVTLTSSNXVGNTVKVDIADATLYVNDIALLDHVLTENGTLNVAKYLATTAFDXGSTVGXXFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGTTDRILHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLQLIDADNVTGNANDLELMINGTTISAGQGVQSTVQQGGYTVANATHNYGMTSNGGSGLYVNYTLSALELLADGANALLLATESGLTANRELNAELSGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNTGTVTLGNGGELTSTDTLINTGMINVTDGILNLENGGASSISGGLTGNGILNIKGGDFTISIDNNGLAGQTNISDGASVTLGNGGTIIGTGNLGSSVIDVLGDLNLVADNSLANVISGDGTINTTATVTLSGNSSFSGAHQIGTNGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGTGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHALTGNGLLNVAKNDASTAFDFGATVGGAFTGTVNLNNSTFDLSGNNTTVLAQATLKLSSGNLTSVGNGVQNIGTLAMNGGTLLFDNIVDNAGIITSDGTIAANSINTTGGGEVRVNLPSNLAPSLDGLSVMELDEGEIIVTLATGAATGTGHELTLTDENGDPISAVTYQGVHNAGSTSAAATGSFNYGMTTGEDYDGLYVNYGLTALELLSTGSEALVLTAILANNGTQSNDLSAQITGSGDLAFASANDGSTASLSNSTNSYTGTTWVSSGNLRLDADSALGQTSLLAMSTATHVDINGTQQVVGELATEGGSTLDLNDGKLTVTGGGQIDGALTGGGELVLSGGLLNVSYDNAGFTGSTDIANGAVAHLSQAQGLGNGTINNNGTLHLDNTIGTLFNALTGSDGEVLLSNNASVQLAGDNSGYSGLFTNQAGSILIANSAEHLGGSSIANSGALILDTGSVWELTNTISGTGTLVKRGSGTVKIEGDTVSAGLTTIEEGLLQLGSSAVTQTLSLEESLQERALLVSFASNMANLTSNVLITANGSLGGYGQVTGNVENYGNLIMPNALTGGDFGTFTIDGNYTGDEGMITFNTILAGDTSVTDRLVITGDTAGQSYVTVNNIGGVGARTFEGIKIIDVGGDSAGQFTLNGRAVGGAYEYFLYQGGASTPDDGNWYLRTEADDRRPEPASYTANLAAANNMFVTSLADRMGETLYTDVFTGEQKTTSLWLRNEGSHNRSRDDSGELKTQDNRYVMQLGGDVAQWSRNAQDLWRVGVMAGYANSSSSTVAQVAGYRSTGSVDGYSVGIYGSWLADNADDTGAYVDSWVQYSWFDNRVSGQDLATEKYDSKGFTASVEGGYAFKVGESVNQSYFIQPKAQVVWMGVKADDHTETNGTVISGDGNGNIQTRLGAKAFINPSDKAKVSGPAFKPFVEANWIHNTKDFGTTLDGVTVKQAGTANIAELKLGVDGQVNSQLNLWGNIGQQVGNKGYSETSVVLGVKYNF', 
'MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATSLSVAPNALAWALMLACTGLPLVTHAQGLVPQGQTQVLQGGNKVPVVNIADPNSGGVSHNKFQQFNVANPGVVFNNGLTDGVSRIGGALTKNPNLTRQASAILAEVTDTSPSRLAGTLEVYGKGADLIIANPNGISVNGLSTLNASNLTLTTGRPSVNGGRIGLDVQQGTVTIERGGVNATGLGYFDVVARLVKLQGAVSSKQGKPLADIAVVAGANRYDHATRRATPIAAGARGAAAGAYAIDGTAAGAMYGKHITLVSSDSGLGVRQLGSLSSPSAITVSSQGEIALGDATVQRGPLSLKGAGVVSAGKLASGGGAVNVAGGGAVKIASASSVGNLAVQGGGKVQATLLNAGGTLLVSGRQAVQLGAASSRQALSVNAGGALKADKLSATRRVDVDGKQAVALGSASSNALSVRAGGALKAGKLSATGRLDVDGKQAVTLGSVASDGALSVSAGGNLRAKQLVSSAQLEVRGQREVALDDASSARGMTVVAAGALAARNLQSKGAIGVQGGEAVSVANANSDAELRVRGRGQVDLHDLSAARGADISGEGRVNIGRARSDSDVKVSAHGALSIDSMTALGAIGVQAGGSVSAKDMRSRGAVTVSGGGAVNLGDVQSDGQVRATSAGAMTVRDVAAAADLALQAGDALQAGFLKSAGAMTVNGRDAVRLDGAHAGGQLRVSSDGQAALGSLAAKGELTVSAARAATVAELKSLDNISVTGGERVSVQSVNSASRVAISAHGALDVGKVSAKSGIGLEGWGAVGADSLGSDGAISVSGRDAVRVDQARSLADISLGAEGGATLGAVEAAGSIDVRGGSTVAANSLHANRDVRVSGKDAVRVTAATSGGGLHVSSGRQLDLGAVQARGALALDGGAGVALQSAKASGTLHVQGGEHLDLGTLAAVGAVDVNGTGDVRVAKLVSDAGADLQAGRSMTLGIVDTTGDLQARAQQKLELGSVKSDGGLQAAAGGALSLAAAEVAGALELSGQGVTVDRASASRARIDSTGSVGIGALKAGAVEAASPRRARRALRQDFFTPGSVVVRAQGNVTVGRGDPHQGVLAQGDIIMDAKGGTLLLRNDALTENGTVTISADSAVLEHSTIESKISQSVLAAKGDKGKPAVSVKVAKKLFLNGTLRAVNDNNETMSGRQIDVVDGRPQITDAVTGEARKDESVVSDAALVADGGPIVVEAGELVSHAGGIGNGRNKENGASVTVRTTGNLVNKGYISAGKQGVLEVGGALTNEFLVGSDGTQRIEAQRIENRGTFQSQAPAGTAGALVVKAAEAIVHDGVMATKGEMQIAGKGGGSPTVTAGAKATTSANKLSVDVASWDNAGSLDIKKGGAQVTVAGRYAEHGEVSIQGDYTVSADAIALAAQVTQRGGAANLTSRHDTRFSNKIRLMGPLQVNAGGAVSNTGNLKVREGVTVTAASFDNETGAEVMAKSATLTTSGAARNAGKMQVKEAATIVAASVSNPGTFTAGKDITVTSRGGFDNEGKMESNKDIVIKTEQFSNGRVLDAKHDLTVTASGQADNRGSLKAGHDFTVQAQRIDNSGTMAAGHDATLKAPHLRNTGQVVAGHDIHIINSAKLENTGRVDARNDIALDVADFTNTGSLYAEHDATLTLAQGTQRDLVVDQDHILPVAEGTLRVKAKSLTTEIETGNPGSLIAEVQENIDNKQAIVVGKDLTLSSAHGNVANEANALLWAAGELTVKAQNITNKRAALIEAGGNARLTAAVALLNKLGRIRAGEDMHLDAPRIENTAKLSGEVQRKGVQDVGGGEHGRWSGIGYVNYWLRAGNGKKAGTIAAPWYGGDLTAEQSLIEVGKDLYLNAGARKDEHRHLLNEGVIQAGGHGHIGGDVDNRSVVRTVSAMEYFKTPLPVSLTALDNRAGLSPATWNFQSTYELLDYLLDQNRYEYIWGLYPTYTEWSVNTLKNLDLGYQAKPAPTAPPMPKAPELD
LRGHTLESAEGRKIFGEYKKLQGEYEKAKMAVQAVEAYGEATRRVHDQLGQRYGKALGGMDAETKEVDGIIQEFAADLRTVYAKQADQATIDAETDKVAQRYKSQIDAVRLQAIQPGRVTLAKALSAALGADWRALGHSQLMQRWKDFKAGKRGAEIAFYPKEQTVLAAGAGLTLSNGAIHNGENAAQNRGRPEGLKIGAHSATSVSGSFDALRDVGLEKRLDIDDALAAVLVNPHIFTRIGAAQTSLADGAAGPALARQARQAPETDGMVDARGLGSADALASLASLDAAQGLEVSGRRNAQVADAGLAGPSAVAAPAVGAADVGVEPVTGDQVDQPVVAVGLEQPVATVRVAPPAVALPRPLFETRIKFIDQSKFYGSRYFFEQIGYKPDRAARVAGDNYFDTTLVREQVRRALGGYESRLPVRGVALVAKLMDSAGTVGKALGLKVGVAPTAQQLKQADRDFVWYVDTVIDGQKVLAPRLYLTEATRQGITDQYAGGGALIASGGDVTVNTDGHDVSSVNGLIQGRSVKVDAGKGKVVVADSKGAGGGIEADDEVDVSGRDIGIEGGKLRGKDVRLKADTVKVATSMRYDDKGRLAARGDGALDAQGGQLHIEAKRLETAGATLKGGKVKLDVDDVKLGGVYEAGSSYENKSSTPLGSLFAILSSTTETNQSAHANHYGTRIEAGTLEGKMQNLEIEGGSVDAAHTDLSVARDARFKAAADFAHAEHEKDVRQLSLGAKVGAGGYEAGFSLGSESGLEAHAGRGMTAGAEVKVGYRASHEQSSETEKSYRNANLNFGGGSVEAGNVLDIGGADINRNRYGGAAKGNAGTEEALRMRAKKVESTKYVSEQTSQSSGWSVEVASTASARSSLLTAATRLGDSVAQNVEDGREIRGELMAAQVAAEATQLVTADTAAVALSAGISADFDSSHSRSTSQNTQYLGGNLSIEATEGDATLVGAKFGGGDQVSLKAAKSVNLMAAESTFESYSESHNFHASADANLGANAVQGAVGLGLTAGMGTSHQITNETGKTYAGTSVDAANVSIDAGKDLNLSGSRVRGKHVVLDVEGDINATSKQDERNYNSSGGGWDASAGVAIQNRTLVAPVGSAGFNFNTEHDNSRLTNDGAAGVVASDGLTGHVKGDANLTGATIADLSGKGNLKVDGAVNAQNLKDYRDKDGGSGGLNVGISSTTLAPTVGVAFGRVAGEDYQAEQRATIDVGQTKDPARLQVGGGVKGTLNQDAAQATVVQRNKHWAGGGSEFSVAGKSLKKKNQVRPVETPTPDVVDGPPSRPTTPPASPQPIRATVEVSSPPPVSVATVEVVPRPKVETAQPLPPRPVAAQVVPVTPPKVEVAKVEVVPRPKVETAQPLPPRPVVAEKVTTPAVQPQLAKVETVQPVKPETTKPLPKPLPVAKVTKAPPPVVETAQPLPPVKPQKATPGPVAEVGKATVTTVQVQSAPPKPAPVAKQPAPAPKPKPKPKPKAERPKPGKTTPLSGRHVVQQQVQVLQRQASDINNTKSLPGGKLPKPVTVKLTDENGKPQTYTINRREDLMKLNGKVLSTKTTLGLEQTFRLRVEDIGGKNYRVFYETNK', 
'MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKGTSALPGFFPFEFRARHRENEKEILRVYRATAADVEAGASITPAAEWLLDNHHVVEEAIQEVRRDFPRRFYRQLPTLSVSGTVIPRTMALAWLYVAHTHSTVTRESITAMVEGFQEHETLKIGELWALPSILRFVLIENLRRIAIRVERSRGMRRKANEVADQLIRLNDPEGCRTLLVESEALAADNTFIAQLLYRMRDGSQSSGAVIAWIEERLERRGTDVEEALVAEQNRLSSGNATMSNIIRSLREIDDTDWAVWFESVSKIDATLREGSDYAALDFGSRNTYRDTIEKLARRSGHSEHEVTEIAIEMVEEAKAAAAVEAPLQEPNVGSFLVGKQRLALEKRIGYSPSIFQHLIRSVRKLDWFAIAGPNILLTILAMIVVYAFVSPMDIPSGAKLIMLLLFALPASEGAMGLFNTVFTLFAKPSRLVGYEFLDGIPEDARTLVVVPCLIAKRDHVDELVRNLEVHYLANPRGEIYFALLSDWADSKSEEAPADTDVLEYAKREIASLSARYAYDGKTRFFLLHRRRLYNEAEGVWMGWERKRGKLHELNLLLRGDRDTSFLQGANMVPEGVQYVMTLDSDTRLMRDAVTKLVGKLYHPINRPVVNPRTQEVVTGYSLLQPRVTPSLTTGSEASAFQRIFTINRGIDPYVFTVSDVYQDIAGEGSFTGKGLYHVDAFEAALKSRIEENAVLSHDLLEGSYARCALVTDIELVEDFPIRYEVEMSRQHRWARGDWQLLPYIFNPKNGLSMLGRWKMYDNLRRSLIPVAWLAASVMGWYYMEPTPALIWQLVLIFSLFVAPTLSLISGIMPRRNDIVARAHLHTVLSDIRAANAQVALRIVFIAHNAAMMADAIVRSLYRTFVSRKLMLEWRTAAQVQSAGHGSIGDYFRAMWTAPALALVSLALAAISDTGLPFIGLPFALIWAASPAVAWFVSQSAETEDQLVVSEEAIEEMRKIARRTWRYFEAFVTAEQNFLPPDNFQETPQPVLAERTSPTNIGVYLLSVMSARSFGWIGFEETITRLEQTIATIDRMPKYRGHLFNWYRTRGLEPMEPRYVSSVDSGNLAGHLIAVSSMCREWAEAPSAHVQGNLDGIGDVAAILKEALNELPDDRKTVRPLRRLVEERIAGFQNALAAVKRERELASIRVINLAVLARDMHKLTVNLDHEVRTVQSGEVATWAGSLVAACEAHIADGVFDLGAIEALRQRLLVLKERARDIAFSMDFSFLFRPERRLLSIGYRVNANELDEACYDLLASEARLTSLFAIAKGDLPTEHWYKLGRPIVPIGARGALVSWSGSMFEYLMPPLVMQERQGGILNQTNNLVVQEQINHGRRLGTPWGISEAAFNARDHELTYQYTNFGVPTLGLKRGLGQNAVIAPYASILACMYDPKSALANLARLREVGALGAYGYHDAVDFTPTRVPEGQKCAVVRNYYAHHHGMSVAAVANVVFNGQLREWFHADPVIEAAELLLQEKAPRDIPVMAAKREPEALGKGQADLLRPEVRVVEDPINQDRETVLLSNGHYSVMLTATGAGYARWNGQSVTRWTPDPVEDRTGTFIFLRDTVTGDWWSATAEPRRAPGEKTVTRFGDDKAEFVKTVGDLTSEVECIVATEHDAEGRRVILLNTGTEDRFIEVTSYAEPVLAMDDADSSHPTFSKMFLRTEISRHGDVIWVSRNKRSPGDPDIEVAHLVTDNAGSERHTQAETDRRRFLGQGRTLAEAAAFDPGATLSGTDGFTLDPIVSLRRVVRVPAGKKVSVIFWTIAAPDREGVDRAIDRYRHPETFNHELIHAWTRSQVQMRHVGITSKEAASFQMLGRYLVYPDMHLRADAETVKTGLASQSALWPLAISGDFPIFCLRINDDGDLGIAREALRAQEYLRARGITADLVVVNERASSYAQDLQHTLDSMCENLRLRGLSDGPRQHIFAVRRDLMEPETWSTLISASRAVFHARNG
TISDQIARATSLYSKSSEKKEEGAEMLLPVIREADARTAVELDGGDLDFWNGFGGFAEDGREYAVRLRGGEATPQPWINVISNEQFGFHVSAEGAAFSWSRNSRDYQLTPWTNDAVVNRPGEAIFVRDMASGAVLTPYAALSRRKSALFETRHGLGYSRFLSTQDELEIEAMHTVHRTLPAKLVRLTIRNRSSAARKLRVYGYAEWVLGNNRSRTAPFVLSEWDESAKTLVATNPYSIDYPGRCAFFASDGDIAGYTASRREFLGRAGGILAPQAVISGAELTGSTDVDGDACAALATDITVEAGVERQVTFFLGDADNPDQVRAVLEELRADSFGAALEAAKAFWGDFTGVVKVETPDRAFNHMINHWLPYQALGCRIMARSAFYQASGAFGFRDQLQDTLAFLIHRPALARAQILNAAARQFVEGDVQHWWLPGTDAGVRTMISDDVVWLAHAVAHYCAVTGEEDILKEKVPFITGPALEEGQHDSFYKPDVADEVGDVYEHCARALDLAIHRTGANGLPLILGGDWNDGMNRVGEAGEGTSVWLGWFLAGTLRAFLPYARARKDKPRVALWERHLEALKDALEQAGWDGDYYRRGYYDDDTPLGSAENGECRIDSIAQSWSTLSGEGDKERSLRAMDAVMAELVDPEKRIVRLFTPPLETTKQDPGYIKAYPPGVRENGGQYTHAATWVVLAFAAQERAEEAWRTFRMLNPVSHALSQVDAEHYRVEPYVVAADIYGEGALAGRGGWTWYTGSAGWLYRAGVEGILGIRKRGDKLLIRPVLPSEWPGYSAEVRVNGTTHRISVSRDSKSGEPVVSVNNSVTKNAHEGVLL', 'MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLWDLLNPKVGGEYVHWVKGSQYCAWWEFAGCLKNVWGANHKGYDAGNAANYLSSQNYQAISVGSGNETGTYSLSGFTNYVGGNLTINLGNSVVLDLSGSNSFTSYQGYNQGKDDVTFTVGAINLNGTLEVGNRVGSGAGTHTGTATLNLNANKVNINSNINAYKTSQVNIGNANSVITIGSVSLSGDVCSSLASVGIGANCSTSGPSYSFKGTTNATNTAFSNASGSFTFEENATFSGAKWNGGTYTFNKEFSATNNTAFSSGSFNFKGVSSFNGTSFSNASYTFDNQATFQNSSFNGGTFTFNNQTNPTNNAQHPQIQNSSFSGNATTLKGFVNFQQAFNNSNHQLTIQNASFNNATFNNTGKITIEKDASFNNTTFNTSVDTNNMSVTGGVTLSGKNDLKNGSTLDFGSSKITLAQGTTFNLTSLGSEKSVTILNSSGGITYSNLLNHAINGLTSALKTNESLSNPQSFAQGLWDIITYNGVTGQLLNENAATSKPTDSSPSKSSTNSTQVYQVGYKIGDTIYKLQETFSHNSIIIQALESGTYTPPPVINGSKFDLSASNYINADMPWYDHKYYIPKSQNFTESGTYYLPSVQIWGSYTNSFKQTFSANGSNLVIGYNSTWTDHNVSSSGTVSFGDTSGSALNGHCGPWPYYQCTGTTNGTYSAYHVYITANLRSGNRIGTGGAANLIFNGVDSINIANATITQHNAGIYSSSMTFSTQSMDNSQNLNGLNSNGKLSVYGTTFTNEAKDGKFIFNAGQAVFENTNFNGGSYQFSGDSLNFSNNNQFNSGSFEISAKNASFNNANFNNSASFNFNNSNATTSFVGDFTNANSNLQIAGNAVFGNSTNGSQNTANFNNTGSVNISGNATFDNVVFNGPTNTSVKGQVTLNNITLKNLNAPLSFGDGTITFNAHSVINIAESITNGNPITLVSSSKEIEYNNAFSKNLWQLINYQGHGASSEKLVSSAGNGVYDVVYSFNNQTYNFQEVFSQNSISIRRLGVNMVFDYVDMEKSDHLYYQNALGFMTYMPNSYNNNLGNANNTIYYYDKSIDFYASGKTLFTKAEFSQTFTGQNSAIVFGAKSIWTSLSDAPQSNTIIRFGDNKGAGSNDASGHCWNLQ
CIGFITGHYEAQKIYITGSIESGNRISSGGGASLNFNGLQGILLTNATLYNRAAGTQSSSMNFISNSANIQAQNSYFIDDTAQNGGNPNFSFNALNLDFSNSSFRGYVGKTQSVFKFNAKNAISFTNSTNLSSGLYQMQAKSVLFDNSNLSVSVGTSSIKANAINLSQNASINASNHSTLELQGDLNVNDTSSLNLNQSTINVSNNATINDYASLIASNGSHLNFNGAVNFNSANITTSLNNSSIVFKGAVSLGGQFNLSNNSSLDFQGSSAITSNTAFNFYDNAFSQSPITFHQALDIKAPLSLGGNLLNPNNSSVLDLKNSQLVFGDQGSLNIANIDLLSDLNDNKNRVYNIIQADMNSNWYERISFFGMHINDGIYDAKNQTYSFTNPLNNALKITESFKDNQLSVTLSQIPGIKNTLYNIGSEIFNYQKVYNNANGVYSYSDDAQGVFYLTSNVKGYYNPNQSYQASGSNNTTKNNNLTSESSIISQTYNAQGNPISALHIYNKGYNFNNIKALGQMALKLYPEIKKVLGNDFSPSSLNALNSNALNQLTKLITPNDWKNINELIDNANNSVVQNFNNGTLIVGATQIGQTDTNSAVVFGGLGYQTPCDYTDIVCQKFRGTYLGQLLESSSADLGYIDTTFNAKEIYLTGTLGSGNAWGTGGSASVTFNSQTSLILNQANIVSSQTDGIFSMLGQEGINKVFNQAGLANILGEVAVQSINKAGGLGNLIVNTLGSNSVIGGYLTPEQKNQTLSQLLGQNNFDNLMNDSGLNTAIKDLIRQKLGFWTGLVGGLAGLGGIDLQNPEKLIGSMSINDLLSKKGLFNQITGFISANDIGQVISVMLQDIVKPSNALKNDVAALGKQMIGEFLGQDTLNSLESLLQNQQIKSVLDKVLAAKGLGPIYEQGLGDLIPNLGKKGLFAPYGLSQVWQKGDFSFNAQGNVFVQNSTFSNANGGTLSFNAGNSLIFAGNNHIAFTNHAGTLQLLSDQVSNINITTLNASNGLKINAANNNVSVSQGNLFVSASCAQQSDPTTANIANPCALSAQSTNGASSNNASNNAPIALSNNDESLMVAANDFNFSGNIYANGVVDFSKIKGSANIKNLYLYNNAQFQANNLTISNQAVLEKNASFVTNNLNIQGAFNNNATQKIEVLQNLVIASNASLSTGIYGLEVGGALNNSGAIHFNLENTQTPTPLIQAEGIINLNTTQTPFMNVNNSMANNTTYTLLKSSRYIDYNINPNSLQSYLNLYTLININGNHIEEKNGALTYLGQRVLLQDKGLLLSVALPNSNNASQNNILSLSVLYNQVKMSCGDKAMDFTPPTLQDYIVGIQGQSALNQIEAVGGNAIKWLSTLMMETKENPFFAPIYLKNHSLNEILGVTKDLQNTASLISNPNFRDNATNLLELASYTQQTSRLTKLSDFRSREGESDFSLLELKNKRFSDPNPEVFVKYSQLSKHPNNLWVQGVGGASFISGGNGTLYGLNAGYDRLVKNVILGGYVAYGYSDFNGNIMHSLGNNVDVGMYARAFLKRNEFTLSANETYGGNATSINSSNSLLSVLNQRYNYNTWTTSVNGNYGYDFMFKQKSVVLKPQVGLSYHFIGLSGMKGNDAAYKQFLMHSNPSNESVLTLNMGLESRKYFGKNSYYFVTARLGRDLLIKSKGSNTVRFVGENTLLYRKGEVFNTFASVITGGEMHLWRLVYVNAGVGLKMGLQYQDINITGNVGMRVAF', 
'MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGATAQAATTESNASAKTEQVVQQNSTSAASDSTSTSNSSAAVSTSSATPVSTESASSMTVSDLPASASAASDNQASAANASESSSQSASSSVASDAAATVSKDSQAASEANSQSAADVETVQLPTSAANANANESQAANILGAQAVQKAANQQAPAGFTVTDPNYPAEMYKDPDASHYTYWWAQSSNGEYNLVLSTDRNGDGKVYVFLLGNNNNVLGKYTVDKNKSTEVATDDEGDFGTVYNDGQSGVFVTSDGTWKSKFNVFDPKAGEDDGDYGSISFMIPQVETQTTTYVTYFDSKGNKVDKPIEVSDPVIQKGLDGQIYTTKGGKVINGYFAKEPKNAHGFMSPFGKQGAIYTKDWHDGLKATFTETDTKTGLMHVVVKHYYHSWGWGTWRTVKEFDLAPGQSEKVDYDVYKSVTIHSIYIPQTINIQYTYEKLGNLVISSDSKSFPAEDKTQYPNDKSDSTKAGNVTIPKVAGFTPTINDKTVTNYTFNPSDYVSDLSKDINVVYVADTQEAAISFYDETDHKPLNDQTIQLTGKTGEKISHTEANQTLAKLGKQGYVVDQNTFADDATYDNDTQAPQEFTIYLKHDTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLADLVKQGYVLDKENTAKAFPADAVYDNNDQTPQEFTIYLKHGTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLAELEKQGYVLDENNTKLGFPSNAAYDDDDVKPQEFTIYLKHGMTHTDATDKNAEQKIVTETIHYVYENNQTAKTDYTSAVDFKRGYTTDNVTHKIISYDPWMVSSKKFGFVKSPAIEGYTPNHSQIDEITVTPDSKDVVKTVVYVGNAQEAQAIFYDETTGKEISGTREIATGKTDETISFTKDPNEVVKELEKQGYVFDKDNAKNNVFVAGTAYDKNSEVHQYFKYYLKHGHATVTPDQDPQKGQKTVTQTIKYEYADGTATGLADNVQTLTFKRTGDKDLVTHEVTWPDWSTVAGQQTSVVTSPALKGYTADTNEIPAITYHAGDSDVTYVVKYNADVQHAVINYIDGESDEILHTDKVNGHSDEKINYSTADMIKQLEAKGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTR
NGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKYEGDGYVLVSDGFKPGTKFGVGTPTYEVHFKHGMTHTDATDKNAEQKTVTETIHYVDENNQTVQPDSTTAVTFKRGYTTDNVTGKVVSYDPWTVDGNQADSKTFAAVPSPAVEGYTPNHQQINEFTVTPDSKDIVKTVVYVGDPQEAQAIFYDETTGKEISNTREIVNGKTDETIGFTKDPNEVVKELEKQGYVFDKDNANNNVFAAGTTYDKNSEVHQYFKYYFTHATTIVTPDNPKTPADVLPDNPGKNYPSGVAKDDLNKTVTRTINITTPDGKTQTITQKAEFTRSATVDEVTGEVTYGPWSKNVVLESVDVPNISGYVPSASVPEITVTPNDQDMTINITYKKLDSGKAADQGGNASNGGQATNGGSTTGQSAQNGQSGQTQNNAGAQQLPQTGNANNEKGALGLASAMFAAGLGLGFGSKKKCHED', 
'MSRKERNFKRFFGQEKARVKLYKSGKQWVKAGIREVQLLKVLGLPFLNKDVEQINNLDTNKDKNFKNQAMKATGLAGGAFTFAMLNDHHAYAASETPMTSEIASNSETVANQNSTTVTKSETSTTEYISSQTSTSQDATSSTNSTEKSTSSSTTDSQTSTDSTSDKSTSNSEKQDSSMSNSDTKASSSSTTDNSTSNNSTTSEKDTNSQANTTSTDSQKGSTSTNDNSITSTSTKDNQIRKNSTESNSITASNSTSDSNSGSTVSTNSTTSQLTSTSESQINTDLGSTLLVSDSTSTSTSTAPLKLRTFSRLATTTFAAAAATSTTNTYTGAGTDTNYNIPIYYKLTTVNNGTSMTFTYTVTYDNPATTTVERPTALSNSYAIYNTGTTNQTMFTLGSAYGTPSTATSYITDSTGAQVSNPRANTTNINKQGSGYTWANGYQMNGAQAKQGYGLTTTWTVPINSSGDTSFTFNPYSTSVTGGTNFFNGKKVTVTDPTSTANSQSASTSTANSQSASTSKSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSTSTSTSTSTANSQSTSTSTSTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSASTSTSESDSTSASTSLSESTSTSLSDSLSASTSLSDSASTSVSDSTSASTSLSGSESASLSDSASASTSLSESTSTSESTSTSESDSTSASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSDSASTSTSVSDSTSASTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSASTSTSVSDSTSASTSLSGSTSTSESDSTSMSTSLSGSESTSLSDSLSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSTSISTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISRSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSV
SDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTNTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSGSTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSSSESTSTSVSDSTSASVSTSISTSISMSESSSTSASTSDSTSTSASTSESRSASHSMSGTDSNNTSSSDSKSHSISNSDSNTTSDSASASTSISDSSSTSTSDSNASHSFSTSHSVSESNSMSTSHSQFDSISTSESMSGTDSTSLSTSLSHSASTSNSTSMTTSESQSNNDSQMHSNSLHHDAKDELPDTGDSDSNSTGLVSAVAAMLAGLGLFGKSRKNKKDKKNKGSEQ', 
'MPSRSPSSARSSRALYAPRLKPLAQAIALLLVAGGAQAAGQPFSAAWFAAKGAAQGGAAGAPRPGAQLPGAPPPLAQQQRVNQQLQRSMANLNNTVAAIAAQQAAQAAGRQAALNLPQDVPDGLGEGGLKVDASLPFEQAWQNAKGPVQTQAAGKTTVSIEQTADKAVLNWETFNVGRNTTVDFQQHADWALLNRVNDPSARPSQIQGQIKADGTVMLVNRNGVVFSGSSQVDVRNLTVAAANISDEQFRQRGLYYDNAGSRPTFTDAAGAVRVEQGAQLRTAAPSGSTRGGGYVLLLGSEVDNAGSIVTPKGQTVLSAGDSFVIRRGQGTDGNLTSTTRGNEVLPGFAADSSAGRVRNSGLVQAATGDISLSGREVEQAGVLLSSSSVDSRGTLHLKASERITLAEGATSAILVDSSGSAALDSQREALLKPLNGSSAAVSRGDDDRRDLSRVEIDSAGSVDFRDGSITLASGGQVAVNAGQRALLRDGAVIDVSGAQGVQVAMETNSIKVNVRGNEQRDASVNREGGGLNSQDVWVDVRDLVRVPAGTNGYASDRWYTAGGLLEVGGYLGTQGHSAGEWMAQGGIVSFTGNDVVTQAGSQINLSGGTLDVQGGYIRQTWLKGSDGRLYELGSAPGDLLYDGIYRGYEAHSERWDQTRYFYNPLIAPTQRYENGYSVGRDAGSLVIGSANARLDGQVVGDTYRGERQTEAPQAGLDGYNQSQNAVARGAQLVVGRYTPYYVKSSGLLEYALGADAGSLKQVVIGTGEVAAEEPTLDAPVAAERQGRLSLDSELLNGFQLGGLKVAAGESIRVDSALTLANGGEAILFANDVAIDADITAHGGSLQAGNVLAQISPNGTIDGFVDAGREAGILRVGDGVRLDASGLWSNLLLAPEDNDTLAYRDGGRISLRSGGDLSLGQGSLLDVSSGAALLADGKRLGGRGGDIALHASAGLAQASDGQLQLGGTLNGLGTSGAGTLSLQSGKVRIGGDDLGDGSLQLAEDFFQQGFASYRVVGRSGLTVAEDAQVRVARPVYRFASGAGEVAAGEAPREALEAWIPPLYLEDALAGRLVQREGADLYLQAGGDGNILGQLDPASQTLELGRGSLVEVDPGRAIVLRGPGQITLDGILNAWGGRIDVRQQQFGALDVTQDNQPKAQGQPHARSIWIGEQALLDVAGRAVTALDGRGRRYGEVQSGGSIVIGGEIDPGKAIATSADAFVIVRPGARLEASGSQAQLDVPGLGRVLLAGDGGRIALSSYNGLYLDGSLRAAAGGSGAAGGSLEIIADAPLYQGFTVVDDRVLAMRELILTAGHADSGLPTLLQPGMDDSALRYGQSRVGTQSLTGGGFDQLSLFSNGPLSFEGNIDLAMGRSLNLYAGTIAATGGGPSEVKLQAPYVRLSGIGMYGQQASGEFRPRLTYGPTATAEQVRLQVSAGRLLDIAGRLSFGSDGVINGVNAEAVRYQRPGFEKVTLRSEGDLRFAGDYPENGDPSGRLITHGDLQLTAAQLYPVTGASSTLYAGYGLDEGGQAVFDAERHLAIERSGESLPDTPLSVFGSLAFMASNIEQGGVVRAPLGLIQFGSNLDRAPGTVRLLPGSLTSVSGAELVMPYGGTTDGINYLVNQVPIQLTGAGGALAAGTLVAGVGLYASEVDVQQGARLDLSGGGELAGAGFISGRGGSTDARFHPLVQQDNDGFRLPELSSNPVYAIVPGHQAVSAPLGGEAGAIQPLVGQQVTIGDGVPGLAAGTYTLLPSTYALLPGAFRVEINGLAGQGAPMATQGLRNGSWATSGQLSIAGTSIRDSLSRQVILSSADTLRRYSQYNEMSYADFIRADAARKNIPRAMLPVDARSLYLGLRADEELRENALSFEGKVDFTPEESGYGGSLIVDAEAGIEILPEGGLPDSDFAGVSLVADDLNAIGASRIAIGTLPYVEYGEQGNFVQFGGSNRLFPVVLRKGAHLSAPEVIIGRDITLEGGSGISTLGKGKTAYDSSDGFIYQPG
GRNLLLLSNGWLNLLAPAADSSLPVRLGGCAEGAGCADTELYSEGTLGIATNGTVTFGDNVRYGTRNLSLALSTINIGSSQSLADAAARGVLPNGLALDQTVLQRLLRGERGAGIPALENLILSARDAVNIYGSVSLDTYDPATGKSSLANLVLGTPAIYGHGTGEDVASIRTASLVWSGSSQPAAAPVAGGAGSGSGTLRVDAERITLGYGANTQPAGETDEARLALGFAEVQLNASERISANHKGSLRVYQRLDGYVAGEGLRYSGGDLRLSTPLLTGEAGSLSRISSGGSLSLAAPAGAAAVTFDSGTAGLGAELSLSAREIRLDSAVSLPSGKLSLSAEDDLELGDGARIDLAGRKASFNDVDKYSWGGDLLLSSRAGDIRQAAGSLIDLSARNNRGGTLSAVALAEDAGVVDLQGRILGGASGDYDAGGTRVPFLGGELEIRAQRLGDGGSLSEQFTALNQRLNQGEVFGARRFQLKQGDLQIGDGLKAHRIEVSLDNGQLGVSGTVDASGAQVGEIRLAGGRGLSLGGNALLDAHGSLLRRDSYGQIIDSPNRAMVELSSGSGTLVLAGGARIDLRHGTAAPAEQVDGVARGTLELNAPRLGGVSAGDIAIDASGALDIRGAGSIALNAMQRYDDAPWGNDPAAGGRSYQVIDQAYLDARHAESSAFIAAALANRELLDGKLAGLTNATYADAFHLRPGVEIVSATADGDLVVQGDLDLSGYRYASLNPNTPLTEVYGSGEVGALVLRAGGDLNLYGSINDGFAPPPDSPDDKGWILTPGVQPFGGDLVVPGPGVVLGDGTAFLGGRTLNYDLPIKGTTLAAGTRLATEAVLEQPYTLAAGSVLVADIHDAAGTLLYAAGSLLRDGVTLEVGSRLGAGTLLAAPASVQAMTWPAGVPLPSILREGPSRPNVLLLNGELALARGSLIPSQTEVVLAGDAPFIELRPSDGVRQGRNWALAEMLPAGSQSWSMRLVAGADLAAADNRLVRPDSSASLNLADTHYQAKIEQSSGGLVFTDQATDWGITPGTPVDESNEWICGLGPYCAEPPRWTWAPGNYLGMPAGTAIGEGDLWWCSVDPSLCIENLGKTVVTPQNQLFSVLRTGTGDLDLASAGNLTQWSPYGVYTAGTQAADVATGFNQPRGLFNGSVLGAGGADYEVLSTSQYQAWYPEHGGNLDIAVGGDVVGDQWAEKLTSSDPIRPLPPSAAVGNWLWRQGSADREGVPTAWWVNFGSYVRGAEGDAPYLVGFTGFGTLGGGNLSMRTGGDAGNIAPRGDGSIPSSGNLNPRSQGLVLAVAGTGRLTSDGALQLGGGGDLNVRIGGEVNPSREARATQTYSSSGFDGLYSGGTIHDLQGALINLRGSASLYSGALGGIDPRYDTLLRDPAEVRSRDAFSPTLASSTGGLTLVAGDTGMRLETRGDLVLGGVTDPGRVGVPNTVGFTAPDGSVYQGGGIAWFSLWTAHTSIDLFAAGGNLTPSTQLVEATNAIPMAGRNLSPSDGRFIYPSIVRAAAPEGSIYLGPSSGDMGGVSLNVSTTPYSLLLAPSLNGELELLAGDSIYAGGYSVQRSGADPANLPSIWTPAFAGYSDAALLNPIAGNGSPDGNPAVIGGLPLFYFGPDSAASLARDLQPARFYALTGDIVGLNSGAQIRFGEQAGNRAGQTWYEGAGPVWMRAGRDIVASGTPLGQRISAPSQISTDASFTGNLFVHDDPNDLSLVQAGRDILYGNFNVAGPGTLEISAGRNILMEDRAAITSLGAVVPGDSRPGADIVLQAGAAGADYQAFLERYLDPANLAQAGTPLAEQPGKVVRTYESELAKWLNERFGFAGDAEQAQAFFAGLPAEQQRIFARQVYFAELRAGGREYNEVGGVRQGSYLRGRNAIAALFPERDPAGNPISYEGDIVMYGGAGVHTDFGGDIQLLSPGGRQVFGIEGEAPPSTAGIVTQGQGDIQAYSRDSILLGQSRIMTTFGGSILAWSAEGDINAGRGSQTTVVYTPPR
RIYDAWGNVSLSPQVPSTGAGIATLNPIPEVAPGDIDLIAPLGTIDAGEAGIRVSGNVNVAALQVVNAANIQTQGQSSGIPLVASVNTGALTSASAAASSATQAAEDVSRQQQAAARQRMPSVITVQVLGFGNERLEPSRDGASRSPGYNPDSAVQVLGAGALGEQARSQLTDEERGNLIL', 'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNDAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVRLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIVLEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDAGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVIGLNDLEVGAATLDNGQQGLLGSQQSTRVSAQALVNRGDGEVSGKRVEARVGSLDNRGGKLIGDDLLVVASGAIDNRLGLFSAANRLDLRARSLDNSGKGTLSSRGGLEVSLGGLLDNRDEGNLLSQGAQRVTVGQLDNRAGGLLSSRSELNVHGASLDNRGGVLVADAGLSATGGAFDNRDGGSASGKAGVRVEVASLRNDQGGKLLSDGRLDLAANAVGNAGGRIAAKGDLQATLGSLAQQGGELVSEKTLKVAADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAALDNQGGRLLSDAGVTLQGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLISETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGAANLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLVGERGVSLDLRGGDLDNQGGLISARGPLSIERLSVLDNRQGGEISSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNTGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQAGMSLRLGGGALRNQQDGLIFSQAGALDVQAGSLDNRQGTLQAQGDNRLRIGGALDNQGGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGGLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFT
GRGGDWSNDGLLASDGSFRLDLSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGNRAASLRNLSGVIESGKDFSLRASLIENRRAVLESKSGLYTAKMEQTACIEGVNAGDCSGKRNAIWTITQRDKTEVTASSAMGQLLAGGDFAIDGGTLNNLSSLIGSGGNLTANLEVLDNQGLETGELETIRVLRTARGGDIGGIDQKSRNFTNLYWYQSANFDPARAGEIPAALNAILSDWSFEYEFPSKGPTPISSGDQSYAAVIQAAGDVTVNASTRIDNGVTRPGYTFVGSGRQVGDSAVGGSGVSVVVPLTSQLPPDLARRQVNPVTLPGFSLPQGDNGLFRLSSRFAEDGNGSAALGAGADRTQGGSGVSVGQQGAGNAAGTWQGQGVRVDGLAGAANVQGQGGSTLGGSLPGVARVQGVPGNATPSASHKYLIETNPALTELKQFLNSDYLLSGLGMNPDDSKKRLGDGLYEQRLIRDAVVARTGQRYIDGLSSDEALFRYLMDNAIAYKDQLHLQLGVGLSAEQMAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLAPNGALIQGRDVKLVSGGDLHNVGTLRARNDLSATADNLDNSGLIEAGKRLDLLAGDSIRNRQGGVIAGRDVSLTALTGDVINERSVTRYDSALDGRTWERSFADSAARVEAANSLNVQAGRDIANLGGVLQSRGDLSLDAGRDVTVAAVEDRQGQTRWSTSRLQSVTQLGAEVSAGRDLNVSAGRDLTAVASTLEARRDIALSAGRDVTLAAAANEEHAYSKTRKVTYQEDKVAQQGTRVDAGGDLAINAGQDLRLIASQASAGDEAYLVAGDKLELLAANDSNYYLYDKKKKGDFGRKETRRDEVTDVKAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWNSAKGKGQTDETLRQTQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKEAEQRGDVDWRMVQEVHDSWKYSNSGMGPATQIAVAIAAAAIGGMAAAGALSGAGVGASSFAMGAGVGAAGSLSGTAAVSLINNKGDLGKVLKDSFSSDSLKQIAIASLTGGLTAEYFDGILQTKTDPLTGKVTVDLSSLSGVGRFAANQAMQNATSTVLSQALGQGGSLNEALKSALYNSFAAAGFNFVGDIGQEYSLKPGDPSMVTMHALMGGLAAQVSGGDFATGAAAAGANEALVAKLDQAFKSLSPENREAMVTMGSQLVGVLAAAVRDPDVTGKALESAAWVAKNSTQYNFLNHQDVADLDNALQKCKSQGNCRQVEEEFKARSDENRRRLNGCVAVGNCAEIRAEIDAGSTALNELVARQETANPGGSDSDIAYGFLMGRNVVDWTTAGQLHLEQTANLWWNGNPQWQKEVGAYLDQTGFNPFGIGVPAMGGAAGKVTAKALMNALKAGELPKGEVAPGKANLPTIGALADAEAGMPYTHPVKLAAKATGTAGKIKIEAGAIPDANEVRAGQGLSGLGYDVTHQTTASAKGIQGQRTADLHVDGLGSIDVYTPKNLDPTKIVRAIEKKSNQAGGVLVQADLPSTDMSSIAARMWGKTNAQSIKTIFFQKPDGSLVRFDRPAGGG', 
'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNGAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVKLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIALEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDGGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVVGLNELQVQAAALDNRSAGLLSSKGDMDIEFARLDNSAGGKLVSERRTLLKADRLDNRSGRIVAGQDLDLSSRLIDNRAGDISSTSRVVASAREQLDNRGGKIVGDSGLDITTPRMLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGVIDADLQGLDQHDRGNLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLVADEANSSKGRIAANGSLHAKVGTLSQKGGELTSQDSLTLDLGILNNNAGRIAGNQGVDITARQVDNSVGEIASQGVVALNLTEQLDNRGGKIVGDSGLGITAPHVLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGAIDAALQGLDQHDRGSLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLAADEANNSKGRIVAKDELRAKLGALVQNGGELTTQGALALDADKVDNGAGRIAGNRGVVIDARQVDNRAGEIASQGVATLNLTEQLDNRGGKVVADSGLGITAPRVLNQDKGVIASRDGLRLSGTELFNGNAGLLSSQRHIEVTLDGVLDNQGKGALLSDGTLTVSAGRIHNQDATLSSAGALRLSSQEAVDNRGGKLVTDSSLRLTSASLDNSRSGIISANAAAEIHTGVLNNSQKGNLGSNDGLGLIATEVDNSQEGRITAKGMIDANIKGLDQQGKGRLVSNAGIILDLNEGTLANGAQGLIATPGTLLLRQLGMVDNSGGEISSDRAFTLTTSALTNQGGRLRSGGVLTLRIAQALDNSLEGVLSGTGGLDIRALALDNRSGSIGSKGAVDIDVSRLENDDGDLLSEGRLKLTAERANSVRGRIAARGDLHASVTAFNQAGGELSSEGALMLEADSLDNRSGGLVSADGNLTVSARRIDNRAGEIASPGQVTLDVAEQLDNRGGKAIGDSGLRLAAPRVLNQDGGVLASRDGLRLNGAELFNGNGGLLSSQQSIDVILDGVLGNQAGSLSSQGRLSVKSGRLDNQGGAVSSAGTLSLSSQGALNNQGGRVVTDAGAVLRSASLDNSQGGIVSAKGAAEIRTGSLNNSQKGGIGSGAGLALVADLVDNSQNGRITAKGAIDANLKGLDQQGSG
RLVSDTAIALDLRGGELVNRAQGLIATPGALLLRQLGVVDNSGGGEISSDRSFTLAATALSNRGGRVISGDSLTLRIAQALDNSLQGVLSASGGLDVAALVFDNHSGIVASKGDTHIGVNRLENEAGRVVSEGALDLTAKQVSSAKGRIAAKGDLQVTVGTLEQQGGELASQGTLTLDADSLDNRNGGLVSADGGVTAEARQIDNRGGEISSVAKVALAVREQLDNRGGKVIGDSELSLTVQRLLNQAKGVLASRDGLHLDGAELLNGDGGLLSSQRLVDVTLSGALDNQGSGALVSEESLTVKADQVNNQAGTFSSAGSLLVTSRGELNNQGGRLVTDAGATLNSTGFDNSRAGLVSAKGAVAIRTGALNNSQKGSIGGNTGVTLVAGLVDNGREGRISTKGTLDANLKGLLQQGGGSLVGERGVTLDLNGGTLDNHDLGLVSTPGALLLRQLGMVDNSVGGEISSDRAFTLAANTLNNQGGRLISSEALTLRIAKTLDNSLKGQVLATDGLAIESQVLDNRAGTIGSKGDARISVTSLDNAEQGSLVSEGRLELVADQVSNGNQGRIAARGVLEAAVGTLLQQGGELVSQGSLDLRADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAVLDNQGGRLLSDAGVTLKGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLVSETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGADSLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLIGERGVSLDLRGGDLDNQGGLISARGPLSIERLNVLDNRQGGEIYSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNAGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQADMSLRLGGGALRNQQDGLIFSQAGALEVQAGSLDNRQGTLQAQGDNRLRIGGALDNQAGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGDLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFTGRGGDWSNDGLLASNGSLRLELSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGGWANRLENISGNLESTGDMRFSVSSLLNRRETLEIEGDLQNSAIGVRCTGCQLSERWGKTRSSSELVWIREYKSTLGDSSAAASITAGRDLLVVGASLQNIASNISAVRDATLSLSNFENKGYALGEYAVRGVYSPPSKFGEELLMRILAYNAVNDPSYG
EGYASTGGRLPNIHYFDKNFNEKVSPLEVIHGNGKNGGPGWHLYFGTLDVEYPDTDRWNKAIGRIPAPNYSSKKTDAIPDLLKGLAPLDELTINKGANSTVGAVVQAGGRVTVNAAESFNNSVLQGFQAVQETQLPHQDIAVSSTTSAVVTLKSQLPADLARQQINPLTLPGFSLPQGQNGLFRLASQGAQVNQASGALKSASDLTQSGHGVSVSAQTGSGASGWSTQARRVGDDRVTSLAGSAYQGRVAEAIDALRASAPISGDGGNTGRFQAGEHQATTGLGGLVEGNASGHSGNGVILADLRGGLPSFSSLPASDHVQGTVPGHDGNGTILANWQGAQATVQASPSTVRVEGVVSSPGGNGSILADLPAEQSSVQALPSAVRAQGSLPRLEERSALLAEPPVGQPALQTLPSVARVEGVPSNATPSNSHKYLIETNPALTELKQFLNSDYLLGGLGINPDDSKKRLGDGLYEQRLVREAIVQRTGQRFIAGLNSDEAMFRYLMDNAIASKDVLGLTPGVTLSAAQVAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLGPNGALIQGRDVNLITGGDLRNAGTLRAQNDLSATAGNIDNSGLIEAGNRLDLLASGSIRNDQGGIIAGREVSLSALTGDVINERTVTQHQSSYRGTGTTEAFADSAARIEAAQKLTVSAGRDVANIGGVIDSKGDLALQGGRDVLVSAAVAERGWTAGSQAYQTQTTQMGAEVVAGRDISVSAGRDISVVGSRIDARRDVTFEAGRDVGLVAAANEEHAYGKTKKVTFQDDKITQQATRVDAGGDLAINAGQDLRLVASQASAGDEAYLVAGDKLELLAANDSSYYLYDKKSKGSFGSKKTRRDEITDVTAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWQSSKGKGQTDETVRQSQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKQMEQRGDVDWRRVQELHDSWKYSNSGLGVGAQLAIAIVVAYFTAGAASAALGSMAGVGAGSGSMMAAAGSTAMVQAGTAVGTAAAGWANAAGTAVAMGMASNGAISTINNRGNLGDVVKDVTSSDALRGYVVAGTTAGLTAGVYDKWTSTQTGTSTALPNTGAVAPAAGLGTWQGVGQFTSNQLLQNGTSVLLDRALGGKGSLGDALQNSLANAFAAYGFKLIGDTTHGVLDDGSLGKIGLHALMGGLAAEAVGGDFRTGALAAGVNEALVDSLAKQYASLPIDDKKGLLIMSSQLIGVLAASTQGDADAKSLQTGAWVAGNATQHNYLSHWQEEKKRQEVDGCKDKQLCKTGIEAKWAIISAQQDVGIVVGVGGGIGLSTAETAVGVYELVKNWRETYAALEQLATSPEFRQQFGDNYLKGLEERAAFLTQAYEDAGWQGSVTAGVEGGRFAAELVGVLTAVKGGAQITAKLPTAAKNLVNAIAESPVSGSMSSQLGAVGDLGRLGGGGKGYVDILSHEAKQHILYGDKPGSGGHLWPGQAGKTVFPQNWSADKIVHEVGDIATSPSTKWYAQTGTGGVYTSKGDPAKWVAYEVRDGVRMRVVYQPATGKVITAFPDNAPIPPYKPIK']\n",
421
- "['CAC14227', 'P12255', 'P20471', 'A64556', 'AAF25576', 'Q4L9P0', 'Q9I5N6', 'Q9I791', 'Q9I120']\n"
422
- ]
423
- }
424
- ],
425
- "source": [
426
- "sequences = list(cpu_sequences['sequence'])\n",
427
- "print(sequences)\n",
428
- "accession = list(cpu_sequences['id'])\n",
429
- "print(accession)"
430
- ]
431
- },
432
- {
433
- "cell_type": "code",
434
- "execution_count": 8,
435
- "id": "2a1832cb",
436
- "metadata": {},
437
- "outputs": [
438
- {
439
- "data": {
440
- "application/vnd.jupyter.widget-view+json": {
441
- "model_id": "5df74f5eb4e24f72b645d0bbc1dc5c36",
442
- "version_major": 2,
443
- "version_minor": 0
444
- },
445
- "text/plain": [
446
- "Processing Sequences: 0%| | 0/9 [00:00<?, ?it/s]"
447
- ]
448
- },
449
- "metadata": {},
450
- "output_type": "display_data"
451
- }
452
- ],
453
- "source": [
454
- "# Setup device\n",
455
- "device = torch.device('cpu')\n",
456
- "\n",
457
- "# Load tokenizer and model\n",
458
- "tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
459
- "model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
460
- "model.full() if device == 'cpu' else model.half()\n",
461
- "\n",
462
- "# Clean sequences\n",
463
- "sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
464
- "sequences = [ \"<AA2fold> \" + s for s in sequences]\n",
465
- "\n",
466
- "# Process each sequence individually\n",
467
- "for i, (seq, acc_id) in enumerate(tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\")):\n",
468
- " try:\n",
469
- " # Tokenize\n",
470
- " ids = tokenizer(\n",
471
- " seq,\n",
472
- " add_special_tokens=True,\n",
473
- " return_tensors='pt'\n",
474
- " ).to(device)\n",
475
- "\n",
476
- " # Forward pass\n",
477
- " with torch.no_grad():\n",
478
- " embedding_repr = model(\n",
479
- " ids.input_ids,\n",
480
- " attention_mask=ids.attention_mask\n",
481
- " )\n",
482
- "\n",
483
- " # Compute actual length (excluding prefix)\n",
484
- " real_len = ids.attention_mask[0].sum().item() - 1\n",
485
- "\n",
486
- " # Extract and average embeddings\n",
487
- " emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
488
- " emb_avg = emb.mean(dim=0).cpu().numpy()\n",
489
- "\n",
490
- " # Save embedding using accession ID\n",
491
- " np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
492
- "\n",
493
- "\n",
494
- " # Cleanup\n",
495
- " del ids, embedding_repr, emb, emb_avg\n",
496
- " torch.cuda.empty_cache()\n",
497
- " gc.collect()\n",
498
- "\n",
499
- " except RuntimeError as e:\n",
500
- " print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
501
- "\n"
502
- ]
503
- }
504
- ],
505
- "metadata": {
506
- "kernelspec": {
507
- "display_name": "tesisEnv",
508
- "language": "python",
509
- "name": "python3"
510
- },
511
- "language_info": {
512
- "codemirror_mode": {
513
- "name": "ipython",
514
- "version": 3
515
- },
516
- "file_extension": ".py",
517
- "mimetype": "text/x-python",
518
- "name": "python",
519
- "nbconvert_exporter": "python",
520
- "pygments_lexer": "ipython3",
521
- "version": "3.10.16"
522
- }
523
- },
524
- "nbformat": 4,
525
- "nbformat_minor": 5
526
- }
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ccbc986f147709315a8cb774f4c8523f9da34af321c111781e15e2d8c30c5f1
3
+ size 56006
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
notebooks/__pycache__/my_utils.cpython-310.pyc DELETED
Binary file (14.2 kB)
 
notebooks/hyperparamsRF.ipynb CHANGED
The diff for this file is too large to render. See raw diff
 
src/__init__.py ADDED
File without changes
src/cli.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tkinter as tk
2
+ from tkinter import Menu
3
+ from src.my_utils import predict_with_prost
4
+
5
+
6
+ def menu():
7
+ """
8
+ Creates and displays the main GUI menu for the Protein Tools application using Tkinter.
9
+
10
+ The menu includes:
11
+ - A "File" menu with options for creating a new file, opening, closing, preferences (with sub-menu for keyboard shortcuts and color themes), and exiting the application.
12
+ - A "Help" menu with options for welcome and about dialogs.
13
+ - Two buttons below the menu: one for loading a FASTA file (triggers `predict_with_prost`), and one for exiting the application.
14
+
15
+ Returns:
16
+ None
17
+ """
18
+ # root window
19
+ root = tk.Tk()
20
+ root.geometry('320x200')
21
+ root.title('Protein Tools Menu')
22
+
23
+ # create a menubar
24
+ menubar = Menu(root)
25
+ root.config(menu=menubar)
26
+
27
+ # create the file_menu
28
+ file_menu = Menu(menubar, tearoff=0)
29
+ file_menu.add_command(label='New')
30
+ file_menu.add_command(label='Open...')
31
+ file_menu.add_command(label='Close')
32
+ file_menu.add_separator()
33
+
34
+ sub_menu = Menu(file_menu, tearoff=0)
35
+ sub_menu.add_command(label='Keyboard Shortcuts')
36
+ sub_menu.add_command(label='Color Themes')
37
+
38
+ file_menu.add_cascade(label="Preferences", menu=sub_menu)
39
+ file_menu.add_separator()
40
+ file_menu.add_command(label='Exit', command=root.destroy)
41
+ menubar.add_cascade(label="File", menu=file_menu, underline=0)
42
+
43
+ # help menu
44
+ help_menu = Menu(menubar, tearoff=0)
45
+ help_menu.add_command(label='Welcome')
46
+ help_menu.add_command(label='About...')
47
+ menubar.add_cascade(label="Help", menu=help_menu, underline=0)
48
+
49
+ # =========================
50
+ # Add Buttons Below Menu
51
+ # =========================
52
+
53
+ btn_fasta = tk.Button(root, text="Load FASTA", command=predict_with_prost)
54
+ btn_fasta.pack(pady=5)
55
+
56
+ btn_exit = tk.Button(root, text="Exit", command=root.quit)
57
+ btn_exit.pack(pady=5)
58
+
59
+ root.mainloop()
60
+
61
+
62
+
63
+ menu()
{notebooks → src}/my_utils.py RENAMED
@@ -5,7 +5,9 @@ from pprint import pprint
5
  from io import StringIO
6
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from urllib.error import HTTPError
8
- from typing import Literal
 
 
9
 
10
 
11
  import pandas as pd
@@ -27,7 +29,7 @@ import umap
27
  import requests
28
  from Bio import Entrez
29
  from Bio import SeqIO
30
- from tqdm.notebook import tqdm
31
 
32
  # Visualization libraries
33
  import seaborn as sns
@@ -36,7 +38,9 @@ import plotly.express as px
36
 
37
  from esm.models.esmc import ESMC
38
  from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
39
- from transformers import T5Tokenizer, T5EncoderModel
 
 
40
 
41
  import torch
42
  import gc
@@ -44,7 +48,18 @@ import gc
44
 
45
 
46
  # Load one chunk of embeddings
47
- def load_emb(path: str, acc: list[str])->list[np.array]:
 
 
 
 
 
 
 
 
 
 
 
48
  X = []
49
  for a in tqdm(acc, desc = 'Cargando embeddings'):
50
  emb : np.ndarray = np.load(os.path.join(path, f"{a}.npy"))
@@ -59,7 +74,14 @@ def load_emb(path: str, acc: list[str])->list[np.array]:
59
  X.append(emb)
60
  return X
61
 
62
- def confusion(title : str, y_true: np.array, y_pred: np.array) -> None:
 
 
 
 
 
 
 
63
 
64
  cm = confusion_matrix(y_true = y_true,
65
  y_pred = y_pred,
@@ -77,27 +99,46 @@ def confusion(title : str, y_true: np.array, y_pred: np.array) -> None:
77
  plt.show()
78
 
79
  def perplexity(X):
 
 
 
 
 
 
 
80
  X_array = np.vstack(X)
81
  perp= np.arange(5, 55, 5)
82
  divergence = []
83
 
84
  for i in perp:
85
  model = TSNE(n_components=2, init="pca", perplexity=i)
86
- reduced = model.fit_transform(X_array)
87
  divergence.append(model.kl_divergence_)
88
  fig = px.line(x=perp, y=divergence, markers=True)
89
  fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
90
  fig.update_traces(line_color="red", line_width=1)
91
  fig.show()
92
 
93
- def plot_umap(X: list[np.array], y: list[str], title: str, org : list[str]) -> None:
 
 
 
 
 
 
 
 
 
 
 
 
94
  reducer = umap.UMAP(n_neighbors=30, random_state=42)
95
- X_array = np.vstack(X)
96
 
97
- scaled_X = StandardScaler().fit_transform(X_array)
98
- embedding = reducer.fit_transform(scaled_X)
 
99
 
100
- fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=y, hover_data= [org, y])
101
  fig.update_layout(
102
  title=title,
103
  xaxis_title="First UMAP",
@@ -106,7 +147,7 @@ def plot_umap(X: list[np.array], y: list[str], title: str, org : list[str]) -> N
106
  fig.show()
107
 
108
 
109
- def plot_PCA(X: np.array, labels: list[str], title: str, org : list[str], scale: bool) -> None:
110
  X_array = np.vstack(X)
111
  pca = PCA(n_components=2, random_state=42)
112
 
@@ -133,22 +174,33 @@ def plot_PCA(X: np.array, labels: list[str], title: str, org : list[str], scale:
133
  fig.show()
134
 
135
 
136
- def tsne_plot(X, y, org : list[str]) -> None:
137
- X_array = np.vstack(StandardScaler().fit_transform(X))
 
 
 
 
 
138
  tsne = TSNE(n_components=2, perplexity=60, random_state=42)
139
  tsne_fit = tsne.fit_transform(X_array)
140
 
141
- fig = px.scatter(x=tsne_fit[:, 0], y=tsne_fit[:, 1], color=y, hover_data= [org, y])
142
  fig.update_layout(
143
  title="t-SNE",
144
  xaxis_title="First t-SNE",
145
- yaxis_title="Second t-SNE",
146
  )
147
-
148
  fig.show()
149
-
150
-
151
  def plot_emb(X, y, model_name, org : list[str]):
 
 
 
 
 
 
 
 
 
152
  print(f"Plotting embeddings for: {model_name}")
153
  plot_PCA(X, y, title="PCA", scale=True, org = org)
154
  tsne_plot(X, y,org = org)
@@ -225,7 +277,7 @@ def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[P
225
  return pipeline, evaluation
226
 
227
 
228
- def randomSVM(X: np.array, y = np.array) -> dict:
229
 
230
  X_train, _, y_train, _ = train_test_split(X,
231
  y,
@@ -336,8 +388,24 @@ def gridSearch(X: np.ndarray, y: np.ndarray, grid: dict):
336
 
337
 
338
  def fetch_uniprot_sequence(uniprot_id: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
339
  url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
340
- response = requests.get(url)
341
 
342
  if response.status_code == 200:
343
  try:
@@ -346,10 +414,10 @@ def fetch_uniprot_sequence(uniprot_id: str):
346
  record = SeqIO.read(fasta_io, "fasta")
347
  return str(record.seq)
348
 
349
- except Exception:
350
  # fallback to UniSave if the standard endpoint is not available
351
  url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
352
- response = requests.get(url)
353
 
354
  if response.status_code == 200:
355
  try:
@@ -358,7 +426,7 @@ def fetch_uniprot_sequence(uniprot_id: str):
358
  fasta_io = StringIO(entries[1])
359
  record = SeqIO.read(fasta_io, "fasta")
360
  return str(record.seq)
361
- except Exception:
362
  print(f'No se pudo obtener la entrada FASTA para {uniprot_id} desde UniSave')
363
  else:
364
  print(f'UniSave URL inválido: {url}')
@@ -372,11 +440,13 @@ def fetch_refseq_sequence(refseq_id : str):
372
  """
373
 
374
  Entrez.email = "puglia.jd@gmail.com" # REQUIRED
375
- Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
376
  # Check if the ID is NaN or None
377
  if pd.isna(refseq_id) or refseq_id is None:
378
  return None
379
-
 
 
380
  try:
381
  handle = Entrez.efetch(
382
  db="protein",
@@ -434,7 +504,9 @@ def _fetch_sequence_for_row(idx, row):
434
  return idx, sequence
435
 
436
 
437
- def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: int = None, max_workers: int = 5) -> pd.DataFrame:
 
 
438
  """
439
  Add a 'sequence' column to the dataframe by fetching sequences from
440
  SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
@@ -481,7 +553,7 @@ def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: int = None, max_
481
  f"({round(success_count/total_rows*100, 2)}%)")
482
  return result_df
483
 
484
- def esm_embed_sequence(model : Literal["esmc_300m", "esmc_600m"], sequence : str, device : str) -> None:
485
 
486
  """
487
  Embed a protein sequence using the specified ESM model.
@@ -524,7 +596,9 @@ def esm_save_emb(model: Literal["esmc_300m", "esmc_600m"],
524
  assert len(seq_list) == len(id_list), "Sequence and ID lists must be the same length."
525
  os.makedirs(path, exist_ok=True)
526
 
527
- for i, (seq, acc) in enumerate(tqdm(zip(seq_list, id_list), total=len(seq_list), desc="Saving embeddings")):
 
 
528
  try:
529
  output: LogitsOutput = esm_embed_sequence(model=model, sequence=seq, device = device)
530
  emb_array = output.embeddings.cpu().numpy()
@@ -543,65 +617,218 @@ def esm_save_emb(model: Literal["esmc_300m", "esmc_600m"],
543
  gc.collect()
544
  torch.cuda.empty_cache()
545
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
546
 
547
- def prost_embed_sequence(seq_list: list[str],
548
- acc_list: list[str],
549
- path: str,
550
- device : Literal["cuda:0", "cpu"] = "cuda:0") -> None:
 
 
551
 
552
- """
 
553
 
554
- Embed protein sequences using ProstT5 and save embeddings.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
555
  Args:
556
- model_name: Name of the ProstT5 model to use.
557
- seq_list: List of protein sequences to embed.
558
- acc_list: List of identifiers corresponding to the sequences.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
559
 
 
 
 
560
  """
 
 
 
 
561
 
562
- assert len(seq_list) == len(acc_list), "Sequence and ID lists must match"
 
 
563
 
564
- os.makedirs(path, exist_ok=True)
 
 
 
 
565
 
566
- device = torch.device(device)
 
 
 
 
 
567
 
568
- tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5")
569
- model = T5EncoderModel.from_pretrained("Rostlab/ProstT5").to(device)
570
- model = model.full() if device.type == 'cpu' else model.half()
571
- model.eval()
 
572
 
573
- for i, (seq, acc_id) in enumerate(tqdm(zip(seq_list, acc_list), total=len(seq_list), desc="Processing Sequences")):
574
- try:
575
- # Tokenize
576
- ids = tokenizer(
577
- seq,
578
- add_special_tokens=True,
579
- return_tensors='pt'
580
- ).to(device)
581
-
582
- # Forward pass
583
- with torch.no_grad():
584
- embedding_repr = model(
585
- ids.input_ids,
586
- attention_mask=ids.attention_mask
587
- )
588
 
589
- real_len = ids.attention_mask[0].sum().item() - 1
590
- if real_len <= 0:
591
- print(f"Sequence too short after tokenization for {acc_id}")
592
- continue
 
593
 
594
- emb = embedding_repr.last_hidden_state[0, 1:real_len]
595
- emb_avg = emb.mean(dim=0).cpu().numpy()
 
596
 
597
- np.save(os.path.join(path, f"{acc_id}.npy"), emb_avg)
598
 
599
- del ids, embedding_repr, emb, emb_avg
 
 
 
 
 
 
600
 
601
- except RuntimeError as e:
602
- print(f"RuntimeError while processing {acc_id}: {e}")
603
 
604
- if i % 100 == 0:
605
- gc.collect()
606
- torch.cuda.empty_cache()
 
 
 
 
 
 
 
607
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  from io import StringIO
6
  from concurrent.futures import ThreadPoolExecutor, as_completed
7
  from urllib.error import HTTPError
8
+ from typing import Literal, Optional
9
+ import tkinter as tk
10
+ from tkinter import filedialog
11
 
12
 
13
  import pandas as pd
 
29
  import requests
30
  from Bio import Entrez
31
  from Bio import SeqIO
32
+ from tqdm import tqdm
33
 
34
  # Visualization libraries
35
  import seaborn as sns
 
38
 
39
  from esm.models.esmc import ESMC
40
  from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
41
+ from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
42
+
43
+ from joblib import load
44
 
45
  import torch
46
  import gc
 
48
 
49
 
50
  # Load one chunk of embeddings
51
+ def load_emb(path: str, acc: list[str])->list[np.ndarray]:
52
+
53
+ """ Load embeddings from a specified path.
54
+ Args:
55
+ path (str): Directory where embeddings are stored.
56
+ acc (list[str]): List of accession IDs corresponding to the embeddings.
57
+ Returns:
58
+ list[np.ndarray]: List of loaded embeddings as numpy arrays.
59
+ """
60
+ if not os.path.exists(path):
61
+ raise FileNotFoundError(f"The specified path does not exist: {path}")
62
+
63
  X = []
64
  for a in tqdm(acc, desc = 'Cargando embeddings'):
65
  emb : np.ndarray = np.load(os.path.join(path, f"{a}.npy"))
 
74
  X.append(emb)
75
  return X
76
 
77
+ def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
78
+
79
+ """ Plot a confusion matrix for the given true and predicted labels.
80
+ Args:
81
+ title (str): Title for the confusion matrix plot.
82
+ y_true (np.ndarray): True labels.
83
+ y_pred (np.ndarray): Predicted labels.
84
+ """
85
 
86
  cm = confusion_matrix(y_true = y_true,
87
  y_pred = y_pred,
 
99
  plt.show()
100
 
101
def perplexity(X):
    """
    Plot the t-SNE KL divergence obtained for a range of perplexity values.

    Args:
        X (list[np.ndarray]): List of feature arrays; they are stacked
            row-wise with ``np.vstack`` before fitting.

    Returns:
        None: Displays an interactive Plotly line chart.
    """
    X_array = np.vstack(X)
    perp = np.arange(5, 55, 5)
    divergence = []

    for i in perp:
        model = TSNE(n_components=2, init="pca", perplexity=i)
        # kl_divergence_ only exists on a fitted estimator, so the model has
        # to be fitted on the data before the attribute is read.
        model.fit(X_array)
        divergence.append(model.kl_divergence_)

    fig = px.line(x=perp, y=divergence, markers=True)
    fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
    fig.update_traces(line_color="red", line_width=1)
    fig.show()
120
 
121
+ def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) -> None:
122
+ """
123
+ Plot a 2D UMAP projection of high-dimensional data with color-coded labels and hover information.
124
+
125
+ Args:
126
+ x (list[np.ndarray]): List of feature arrays to be concatenated and visualized.
127
+ y (list[str]): List of labels corresponding to each sample in x, used for coloring the scatter plot.
128
+ title (str): Title of the plot.
129
+ org (list[str]): List of organism or group identifiers for each sample, shown in hover data.
130
+
131
+ Returns:
132
+ None: Displays an interactive UMAP scatter plot using Plotly.
133
+ """
134
  reducer = umap.UMAP(n_neighbors=30, random_state=42)
135
+ x_array = np.vstack(x)
136
 
137
+ scaled_x = StandardScaler().fit_transform(x_array)
138
+ embedding = reducer.fit_transform(scaled_x)
139
+ embedding = np.array(embedding) # Ensure it's a NumPy array for slicing
140
 
141
+ fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=y, hover_data=[org, y])
142
  fig.update_layout(
143
  title=title,
144
  xaxis_title="First UMAP",
 
147
  fig.show()
148
 
149
 
150
+ def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
151
  X_array = np.vstack(X)
152
  pca = PCA(n_components=2, random_state=42)
153
 
 
174
  fig.show()
175
 
176
 
177
def tsne_plot(X, y, org: list[str]) -> None:
    """
    Show a 2D t-SNE scatter plot of standardized features.

    Args:
        X: Either a list of arrays (stacked row-wise with ``np.vstack``) or an
            array that is used as given.
        y: Per-sample labels, used for point colour and shown on hover.
        org (list[str]): Per-sample values shown on hover next to the label.

    Returns:
        None: Displays an interactive Plotly scatter plot.
    """
    features = np.vstack(X) if isinstance(X, list) else X
    standardized = StandardScaler().fit_transform(features)

    projection = TSNE(
        n_components=2, perplexity=60, random_state=42
    ).fit_transform(standardized)

    figure = px.scatter(
        x=projection[:, 0],
        y=projection[:, 1],
        color=y,
        hover_data=[org, y],
    )
    figure.update_layout(
        title="t-SNE",
        xaxis_title="First t-SNE",
        yaxis_title="Second t-SNE",
    )
    figure.show()
 
 
194
  def plot_emb(X, y, model_name, org : list[str]):
195
+
196
+ """ Plot embeddings using PCA, t-SNE, and UMAP.
197
+ Args:
198
+ X (list[np.ndarray]): List of feature arrays to be concatenated and visualized.
199
+ y (list[str]): List of labels corresponding to each sample in X, used for coloring the scatter plot.
200
+ model_name (str): Name of the model used for generating embeddings.
201
+ org (list[str]): List of organism or group identifiers for each sample, shown in hover data.
202
+ """
203
+
204
  print(f"Plotting embeddings for: {model_name}")
205
  plot_PCA(X, y, title="PCA", scale=True, org = org)
206
  tsne_plot(X, y,org = org)
 
277
  return pipeline, evaluation
278
 
279
 
280
+ def randomSVM(X: np.ndarray, y = np.ndarray) -> dict:
281
 
282
  X_train, _, y_train, _ = train_test_split(X,
283
  y,
 
388
 
389
 
390
  def fetch_uniprot_sequence(uniprot_id: str):
391
+
392
+ """
393
+ Fetch the protein sequence for the given UniProt ID.
394
+ Returns the raw amino-acid sequence as a string.
395
+ Args:
396
+ uniprot_id: UniProt ID to fetch the sequence for.
397
+ Returns:
398
+ str: Amino-acid sequence in FASTA format.
399
+ Raises:
400
+ HTTPError: If the request to UniProt fails.
401
+ Note:
402
+ This function first tries to fetch the sequence from the standard UniProt endpoint.
403
+ If that fails, it falls back to the UniSave endpoint.
404
+ If both fail, it returns None and prints an error message.
405
+ """
406
+
407
  url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
408
+ response = requests.get(url, timeout=10)
409
 
410
  if response.status_code == 200:
411
  try:
 
414
  record = SeqIO.read(fasta_io, "fasta")
415
  return str(record.seq)
416
 
417
+ except ValueError:
418
  # fallback to UniSave if the standard endpoint is not available
419
  url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
420
+ response = requests.get(url, timeout=10)
421
 
422
  if response.status_code == 200:
423
  try:
 
426
  fasta_io = StringIO(entries[1])
427
  record = SeqIO.read(fasta_io, "fasta")
428
  return str(record.seq)
429
+ except ValueError:
430
  print(f'No se pudo obtener la entrada FASTA para {uniprot_id} desde UniSave')
431
  else:
432
  print(f'UniSave URL inválido: {url}')
 
440
  """
441
 
442
  Entrez.email = "puglia.jd@gmail.com" # REQUIRED
443
+ Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
444
  # Check if the ID is NaN or None
445
  if pd.isna(refseq_id) or refseq_id is None:
446
  return None
447
+
448
+ fasta_data = None
449
+
450
  try:
451
  handle = Entrez.efetch(
452
  db="protein",
 
504
  return idx, sequence
505
 
506
 
507
+
508
+
509
+ def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: Optional[int] = None, max_workers: int = 5) -> pd.DataFrame:
510
  """
511
  Add a 'sequence' column to the dataframe by fetching sequences from
512
  SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
 
553
  f"({round(success_count/total_rows*100, 2)}%)")
554
  return result_df
555
 
556
+ def esm_embed_sequence(model : Literal["esmc_300m", "esmc_600m"], sequence : str, device : str) -> LogitsOutput:
557
 
558
  """
559
  Embed a protein sequence using the specified ESM model.
 
596
  assert len(seq_list) == len(id_list), "Sequence and ID lists must be the same length."
597
  os.makedirs(path, exist_ok=True)
598
 
599
+ for i, (seq, acc) in enumerate(
600
+ tqdm(zip(seq_list, id_list),
601
+ total=len(seq_list), desc="Saving embeddings")):
602
  try:
603
  output: LogitsOutput = esm_embed_sequence(model=model, sequence=seq, device = device)
604
  emb_array = output.embeddings.cpu().numpy()
 
617
  gc.collect()
618
  torch.cuda.empty_cache()
619
 
620
def prost_embed_sequence(seq : str,
                         acc : str,
                         tokenizer : "T5Tokenizer",
                         model : "PreTrainedModel",
                         device : torch.device = torch.device(
                             'cuda:0'
                             if torch.cuda.is_available()
                             else 'cpu'
                         )) -> Optional[np.ndarray]:
    """
    Embed one protein sequence with a ProstT5 encoder and return the
    position-averaged embedding.

    Args:
        seq (str): Amino-acid sequence. The characters U, Z, O and B are
            replaced with 'X' and residues are space-separated before
            tokenization.
        acc (str): Accession or identifier of the sequence; only used in the
            printed diagnostics.
        tokenizer (T5Tokenizer): Tokenizer called on the prepared sequence; it
            must return a mapping with 'input_ids' and 'attention_mask'.
        model (PreTrainedModel): Encoder whose output exposes
            ``last_hidden_state``. It is moved to ``device`` and cast to half
            precision on non-CPU devices, float otherwise.
        device (torch.device, optional): Device to run on. Defaults to CUDA
            when available, otherwise CPU (evaluated once, at import time).

    Returns:
        Optional[np.ndarray]: The averaged embedding vector, or None when the
        sequence is too short to leave any position to average, or when a
        RuntimeError/ValueError is raised while embedding.
    """
    model = model.to(device)  # type: ignore
    # Half precision on accelerators, full precision on CPU.
    model = model.half() if str(device) != 'cpu' else model.float()

    seq = re.sub(r"[UZOB]", "X", seq)  # Replace non-standard amino acids with 'X'
    seq = " ".join(list(seq))  # Space-separate amino acids for ProstT5

    try:
        ids = tokenizer(seq, add_special_tokens=True, return_tensors='pt')
        # Move tensors to the target device after tokenization.
        ids = {k: v.to(device) for k, v in ids.items()}

        with torch.no_grad():
            embedding_repr = model(
                ids['input_ids'],
                attention_mask=ids['attention_mask']
            )

        # Number of attended tokens minus one.
        # NOTE(review): no '<AA2fold>' prefix is prepended here, so position 0
        # is presumably the first residue rather than a special token. The
        # 1:real_len window is kept unchanged so embeddings stay comparable
        # with whatever the downstream classifier was trained on — confirm.
        real_len = ids['attention_mask'][0].sum().item() - 1

        if real_len <= 0:
            print(f"Sequence too short after tokenization for {acc}")
            return None

        emb = embedding_repr.last_hidden_state[0, 1:real_len]
        if emb.shape[0] == 0:
            # Averaging an empty window would yield a NaN vector.
            print(f"Sequence too short after tokenization for {acc}")
            return None

        return emb.mean(dim=0).cpu().numpy()

    except RuntimeError as e:
        print(f"RuntimeError while processing {acc}: {e}")
        return None
    except ValueError as e:
        print(f"ValueError while processing {acc}: {e}")
        return None
682
+
683
def fasta_to_seq(fasta_file: str) -> Optional[tuple[list[str], list[str]]]:
    """
    Parse a FASTA file into parallel lists of sequences and record IDs.

    Args:
        fasta_file (str): Path to the FASTA file to be read.

    Returns:
        Optional[tuple[list[str], list[str]]]: ``(sequences, ids)`` with one
        entry per record, in file order. None is returned (after printing a
        message) when parsing raises ValueError.
    """
    with open(fasta_file, 'r', encoding='utf-8') as handle:
        try:
            records = [
                (str(rec.seq), str(rec.id))
                for rec in SeqIO.parse(handle, "fasta")
            ]
        except ValueError as err:
            print(f"Error reading {fasta_file}: {err}")
            return None

    sequences = [sequence for sequence, _ in records]
    ids = [identifier for _, identifier in records]
    return sequences, ids
710
+
711
def save_predictions_to_txt(predictions_dict: dict[str, tuple[list[str], list[float]]],
                            output_file: str) -> None:
    """
    Save predictions to a text file, one line per sequence.

    The file starts with the header ``Sequence_ID,Predictions``. Each following
    line is ``<sequence_id>,<Class1> (<p1>), <Class2> (<p2>), ...`` with the
    classes ordered by descending probability and probabilities printed with
    four decimals.

    Args:
        predictions_dict: Dictionary with sequence_id as key and
            (class_names, probabilities) as value.
        output_file: Path to the output text file (overwritten if it exists).
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Sequence_ID,Predictions\n")  # Header

        for seq_id, (class_names, probabilities) in predictions_dict.items():
            # Rank the (class, probability) pairs and write them in ranked
            # order; sorted() is stable, so ties keep their input order.
            ranked = sorted(
                zip(class_names, probabilities),
                key=lambda pair: pair[1],
                reverse=True,
            )
            pred_line = ", ".join(f"{cls} ({prob:.4f})" for cls, prob in ranked)
            f.write(f"{seq_id},{pred_line}\n")
734
 
735
def predict_with_prost(model_path: str = '/home/juan/ProteinLocationPredictor/ProteinLocationPredictor/Models/rfProst.joblib'):
    """
    Interactively classify the sequences of a FASTA file from ProstT5 embeddings.

    Opens one dialog to pick a FASTA file and one to pick an output directory,
    embeds every sequence with ProstT5, loads a pre-trained classifier with
    joblib, and writes the per-class probabilities of each sequence to
    ``<fasta name>_predictions.txt`` in the chosen directory.

    Args:
        model_path (str): Path to the joblib-serialized classifier; it must
            provide ``predict_proba``. Defaults to the path previously
            hard-coded in this function.

    Returns:
        None in every case except when the FASTA file cannot be parsed or
        holds no records, where an empty dict is returned (kept for backward
        compatibility).
    """
    root = tk.Tk()
    root.withdraw()

    try:
        fasta_path : str = filedialog.askopenfilename(
            title="Select a FASTA file",
            filetypes=[("FASTA files", "*.fasta *.fa")],
            initialdir="."
        )

        if not fasta_path:
            print("No file selected.")
            return

        # Select output directory for results
        output_dir: str = filedialog.askdirectory(
            title="Select output directory for results",
            initialdir="."
        )

        if not output_dir:
            print("No output directory selected.")
            return
    finally:
        # The hidden root window is only needed while the dialogs are open.
        root.destroy()

    result = fasta_to_seq(fasta_path)

    # A file with zero records is handled like an unparsable one; otherwise an
    # empty feature matrix would reach the classifier.
    if result is None or not result[0]:
        print("No sequences found in the FASTA file.")
        return {}

    sequences, ids = result
    print(f"Sequences loaded from {fasta_path}: {len(sequences)} sequences found.")
    print("Embedding sequences using ProstT5...")

    tokenizer : T5Tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False)
    model : PreTrainedModel = T5EncoderModel.from_pretrained("Rostlab/ProstT5")

    embeddings : dict[str, np.ndarray] = {}

    for seq, acc in tqdm(zip(sequences, ids), total=len(sequences), desc="Embedding sequences"):
        emb = prost_embed_sequence(seq, acc, tokenizer, model)
        if emb is not None:
            embeddings[acc] = emb
        else:
            print(f"Failed to embed sequence {acc}. Skipping.")

    print(f"Embedded {len(embeddings)} sequences successfully.")

    if not embeddings:
        print("No sequences could be embedded; nothing to predict.")
        return

    print("Loading pre-trained model for prediction...")
    try:
        # NOTE: joblib.load unpickles the file, so only load trusted models.
        predictor = load(model_path)
    except FileNotFoundError:
        print(f"Error: Could not find the model file '{model_path}'")
        print("Please check the path to your trained model.")
        return

    sequence_ids = list(embeddings.keys())
    X = np.array(list(embeddings.values()))  # type: ignore
    print("Making predictions...")
    y_pred_proba = predictor.predict_proba(X)

    # Use the classifier's own class labels when it exposes them, otherwise
    # fall back to generic names.
    if hasattr(predictor, 'classes_'):
        class_names = predictor.classes_.tolist()
    else:
        n_classes = y_pred_proba.shape[1]
        class_names = [f"Class_{i}" for i in range(n_classes)]

    # Convert class names to strings if they aren't already
    class_names = [str(cls) for cls in class_names]

    # Per sequence: classes and probabilities ordered by descending probability.
    predictions_dict = {}
    for i, seq_id in enumerate(sequence_ids):
        probabilities = y_pred_proba[i].tolist()
        class_prob_pairs = sorted(zip(class_names, probabilities), key=lambda x: x[1], reverse=True)
        sorted_classes, sorted_probs = zip(*class_prob_pairs)
        predictions_dict[seq_id] = (list(sorted_classes), list(sorted_probs))

    # Generate output filename
    input_filename = os.path.splitext(os.path.basename(fasta_path))[0]
    output_file = os.path.join(output_dir, f"{input_filename}_predictions.txt")

    # Save predictions to file
    print(f"Saving predictions to {output_file}...")
    save_predictions_to_txt(predictions_dict, output_file)

    print("Predictions saved successfully!")
    print(f"Total sequences processed: {len(embeddings)}")
    print(f"Output file: {output_file}")

    # Print a few sample predictions
    print("\nSample predictions:")
    for seq_id, (classes, probs) in list(predictions_dict.items())[:3]:
        pred_str = ", ".join([f"{cls} ({prob:.4f})" for cls, prob in zip(classes, probs)])
        print(f"{seq_id}: {pred_str}")