Created CLI prototype, rearrangement of files
Browse files- .gitattributes +2 -45
- .gitignore +5 -1
- Data/idmapping_2025_06_24_predictions.txt +6 -0
- {ProteinLocationPredictor/Models → Models}/rfESM300.joblib +0 -0
- {ProteinLocationPredictor/Models → Models}/rfESM600.joblib +0 -0
- {ProteinLocationPredictor/Models → Models}/rfProst.joblib +0 -0
- {ProteinLocationPredictor/Models → Models}/svm300.joblib +0 -0
- {ProteinLocationPredictor/Models → Models}/svmESM600.joblib +0 -0
- {ProteinLocationPredictor/Models → Models}/svmProst.joblib +0 -0
- ProteinLocationPredictor/.gitattributes +0 -35
- ProteinLocationPredictor/README.md +0 -3
- notebooks/EDA_Psort.ipynb +0 -0
- notebooks/ESMC_300m.ipynb +3 -421
- notebooks/ESMC_600m.ipynb +3 -256
- notebooks/EmbAnalisis.ipynb +2 -2
- notebooks/ProstT5.ipynb +3 -526
- notebooks/__pycache__/my_utils.cpython-310.pyc +0 -0
- notebooks/hyperparamsRF.ipynb +0 -0
- src/__init__.py +0 -0
- src/cli.py +63 -0
- {notebooks → src}/my_utils.py +300 -73
.gitattributes
CHANGED
|
@@ -1,45 +1,2 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
-
Plots/Embeddings/PCA_ESM300m.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
-
Plots/Embeddings/PCA_ESM600m.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
-
Plots/Embeddings/PCA_ProstT5.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
-
Plots/Embeddings/UMAP_ESM300m.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
-
Plots/Embeddings/UMAP_ESM600m.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
-
Plots/Embeddings/UMAP_ProstT5.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
-
Plots/Embeddings/t-SNE_ESM300m.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
-
Plots/Embeddings/t-SNE_ESM600m.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
-
Plots/Embeddings/t-SNE_ProstT5.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
-
notebooks/EmbAnalisis.ipynb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 1 |
+
Models/*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 2 |
+
notebooks/*.ipynb filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
.gitignore
CHANGED
|
@@ -1 +1,5 @@
|
|
| 1 |
-
*.npy
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
*.npy
|
| 2 |
+
*.tab
|
| 3 |
+
__pycache__/
|
| 4 |
+
*.pyc
|
| 5 |
+
/home/juan/ProteinLocationPredictor/notebooks/__pycache__
|
Data/idmapping_2025_06_24_predictions.txt
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Sequence_ID,Predictions
|
| 2 |
+
sp|P0A7V8|RS4_ECOLI,Cytoplasmic (0.5908), CytoplasmicMembrane (0.2121), Periplasmic (0.1080), Extracellular (0.0750), OuterMembrane (0.0140), Cellwall (0.0000)
|
| 3 |
+
sp|P0A910|OMPA_ECOLI,OuterMembrane (0.9844), CytoplasmicMembrane (0.0069), Extracellular (0.0037), Cytoplasmic (0.0028), Periplasmic (0.0021), Cellwall (0.0000)
|
| 4 |
+
sp|P0A6F5|CH60_ECOLI,Cytoplasmic (0.7449), CytoplasmicMembrane (0.1760), Periplasmic (0.0376), Extracellular (0.0267), OuterMembrane (0.0145), Cellwall (0.0003)
|
| 5 |
+
sp|P02930|TOLC_ECOLI,OuterMembrane (0.9672), CytoplasmicMembrane (0.0185), Extracellular (0.0059), Periplasmic (0.0048), Cytoplasmic (0.0036), Cellwall (0.0000)
|
| 6 |
+
tr|Q9L1T3|Q9L1T3_STRCO,CytoplasmicMembrane (0.7330), Cytoplasmic (0.0996), Periplasmic (0.0820), Extracellular (0.0585), OuterMembrane (0.0260), Cellwall (0.0009)
|
{ProteinLocationPredictor/Models → Models}/rfESM300.joblib
RENAMED
|
File without changes
|
{ProteinLocationPredictor/Models → Models}/rfESM600.joblib
RENAMED
|
File without changes
|
{ProteinLocationPredictor/Models → Models}/rfProst.joblib
RENAMED
|
File without changes
|
{ProteinLocationPredictor/Models → Models}/svm300.joblib
RENAMED
|
File without changes
|
{ProteinLocationPredictor/Models → Models}/svmESM600.joblib
RENAMED
|
File without changes
|
{ProteinLocationPredictor/Models → Models}/svmProst.joblib
RENAMED
|
File without changes
|
ProteinLocationPredictor/.gitattributes
DELETED
|
@@ -1,35 +0,0 @@
|
|
| 1 |
-
*.7z filter=lfs diff=lfs merge=lfs -text
|
| 2 |
-
*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 3 |
-
*.bin filter=lfs diff=lfs merge=lfs -text
|
| 4 |
-
*.bz2 filter=lfs diff=lfs merge=lfs -text
|
| 5 |
-
*.ckpt filter=lfs diff=lfs merge=lfs -text
|
| 6 |
-
*.ftz filter=lfs diff=lfs merge=lfs -text
|
| 7 |
-
*.gz filter=lfs diff=lfs merge=lfs -text
|
| 8 |
-
*.h5 filter=lfs diff=lfs merge=lfs -text
|
| 9 |
-
*.joblib filter=lfs diff=lfs merge=lfs -text
|
| 10 |
-
*.lfs.* filter=lfs diff=lfs merge=lfs -text
|
| 11 |
-
*.mlmodel filter=lfs diff=lfs merge=lfs -text
|
| 12 |
-
*.model filter=lfs diff=lfs merge=lfs -text
|
| 13 |
-
*.msgpack filter=lfs diff=lfs merge=lfs -text
|
| 14 |
-
*.npy filter=lfs diff=lfs merge=lfs -text
|
| 15 |
-
*.npz filter=lfs diff=lfs merge=lfs -text
|
| 16 |
-
*.onnx filter=lfs diff=lfs merge=lfs -text
|
| 17 |
-
*.ot filter=lfs diff=lfs merge=lfs -text
|
| 18 |
-
*.parquet filter=lfs diff=lfs merge=lfs -text
|
| 19 |
-
*.pb filter=lfs diff=lfs merge=lfs -text
|
| 20 |
-
*.pickle filter=lfs diff=lfs merge=lfs -text
|
| 21 |
-
*.pkl filter=lfs diff=lfs merge=lfs -text
|
| 22 |
-
*.pt filter=lfs diff=lfs merge=lfs -text
|
| 23 |
-
*.pth filter=lfs diff=lfs merge=lfs -text
|
| 24 |
-
*.rar filter=lfs diff=lfs merge=lfs -text
|
| 25 |
-
*.safetensors filter=lfs diff=lfs merge=lfs -text
|
| 26 |
-
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
| 27 |
-
*.tar.* filter=lfs diff=lfs merge=lfs -text
|
| 28 |
-
*.tar filter=lfs diff=lfs merge=lfs -text
|
| 29 |
-
*.tflite filter=lfs diff=lfs merge=lfs -text
|
| 30 |
-
*.tgz filter=lfs diff=lfs merge=lfs -text
|
| 31 |
-
*.wasm filter=lfs diff=lfs merge=lfs -text
|
| 32 |
-
*.xz filter=lfs diff=lfs merge=lfs -text
|
| 33 |
-
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
-
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
-
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
ProteinLocationPredictor/README.md
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: mit
|
| 3 |
-
---
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/EDA_Psort.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
notebooks/ESMC_300m.ipynb
CHANGED
|
@@ -1,421 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
"cell_type": "code",
|
| 5 |
-
"execution_count": 1,
|
| 6 |
-
"id": "c409c4ad",
|
| 7 |
-
"metadata": {},
|
| 8 |
-
"outputs": [],
|
| 9 |
-
"source": [
|
| 10 |
-
"from esm.models.esmc import ESMC\n",
|
| 11 |
-
"from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
|
| 12 |
-
"from esm.sdk.forge import ESM3ForgeInferenceClient\n",
|
| 13 |
-
"import pandas as pd\n",
|
| 14 |
-
"import os\n",
|
| 15 |
-
"from concurrent.futures import ProcessPoolExecutor, as_completed\n",
|
| 16 |
-
"from tqdm import tqdm\n",
|
| 17 |
-
"import numpy as np\n",
|
| 18 |
-
"import os\n",
|
| 19 |
-
"import torch\n",
|
| 20 |
-
"import gc"
|
| 21 |
-
]
|
| 22 |
-
},
|
| 23 |
-
{
|
| 24 |
-
"cell_type": "code",
|
| 25 |
-
"execution_count": 4,
|
| 26 |
-
"id": "7f8f916c",
|
| 27 |
-
"metadata": {},
|
| 28 |
-
"outputs": [
|
| 29 |
-
{
|
| 30 |
-
"data": {
|
| 31 |
-
"text/html": [
|
| 32 |
-
"<div>\n",
|
| 33 |
-
"<style scoped>\n",
|
| 34 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
| 35 |
-
" vertical-align: middle;\n",
|
| 36 |
-
" }\n",
|
| 37 |
-
"\n",
|
| 38 |
-
" .dataframe tbody tr th {\n",
|
| 39 |
-
" vertical-align: top;\n",
|
| 40 |
-
" }\n",
|
| 41 |
-
"\n",
|
| 42 |
-
" .dataframe thead th {\n",
|
| 43 |
-
" text-align: right;\n",
|
| 44 |
-
" }\n",
|
| 45 |
-
"</style>\n",
|
| 46 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
| 47 |
-
" <thead>\n",
|
| 48 |
-
" <tr style=\"text-align: right;\">\n",
|
| 49 |
-
" <th></th>\n",
|
| 50 |
-
" <th>SwissProt_ID</th>\n",
|
| 51 |
-
" <th>Refseq_Accession</th>\n",
|
| 52 |
-
" <th>Other_Accession</th>\n",
|
| 53 |
-
" <th>GramStain</th>\n",
|
| 54 |
-
" <th>Experimental_Localization</th>\n",
|
| 55 |
-
" <th>Phylum</th>\n",
|
| 56 |
-
" <th>Class</th>\n",
|
| 57 |
-
" <th>Organism</th>\n",
|
| 58 |
-
" <th>sequence</th>\n",
|
| 59 |
-
" </tr>\n",
|
| 60 |
-
" </thead>\n",
|
| 61 |
-
" <tbody>\n",
|
| 62 |
-
" <tr>\n",
|
| 63 |
-
" <th>0</th>\n",
|
| 64 |
-
" <td>P50307</td>\n",
|
| 65 |
-
" <td>NaN</td>\n",
|
| 66 |
-
" <td>NaN</td>\n",
|
| 67 |
-
" <td>Gram positive</td>\n",
|
| 68 |
-
" <td>Cytoplasmic</td>\n",
|
| 69 |
-
" <td>Firmicutes</td>\n",
|
| 70 |
-
" <td>Bacilli</td>\n",
|
| 71 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 72 |
-
" <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
|
| 73 |
-
" </tr>\n",
|
| 74 |
-
" <tr>\n",
|
| 75 |
-
" <th>1</th>\n",
|
| 76 |
-
" <td>P01552</td>\n",
|
| 77 |
-
" <td>NaN</td>\n",
|
| 78 |
-
" <td>NaN</td>\n",
|
| 79 |
-
" <td>Gram positive</td>\n",
|
| 80 |
-
" <td>Extracellular</td>\n",
|
| 81 |
-
" <td>Firmicutes</td>\n",
|
| 82 |
-
" <td>Bacilli</td>\n",
|
| 83 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 84 |
-
" <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
|
| 85 |
-
" </tr>\n",
|
| 86 |
-
" <tr>\n",
|
| 87 |
-
" <th>2</th>\n",
|
| 88 |
-
" <td>P09978</td>\n",
|
| 89 |
-
" <td>NaN</td>\n",
|
| 90 |
-
" <td>NaN</td>\n",
|
| 91 |
-
" <td>Gram positive</td>\n",
|
| 92 |
-
" <td>Extracellular</td>\n",
|
| 93 |
-
" <td>Firmicutes</td>\n",
|
| 94 |
-
" <td>Bacilli</td>\n",
|
| 95 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 96 |
-
" <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
|
| 97 |
-
" </tr>\n",
|
| 98 |
-
" <tr>\n",
|
| 99 |
-
" <th>3</th>\n",
|
| 100 |
-
" <td>P45723</td>\n",
|
| 101 |
-
" <td>NaN</td>\n",
|
| 102 |
-
" <td>NaN</td>\n",
|
| 103 |
-
" <td>Gram positive</td>\n",
|
| 104 |
-
" <td>Extracellular</td>\n",
|
| 105 |
-
" <td>Firmicutes</td>\n",
|
| 106 |
-
" <td>Bacilli</td>\n",
|
| 107 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 108 |
-
" <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
|
| 109 |
-
" </tr>\n",
|
| 110 |
-
" <tr>\n",
|
| 111 |
-
" <th>4</th>\n",
|
| 112 |
-
" <td>P81177</td>\n",
|
| 113 |
-
" <td>NaN</td>\n",
|
| 114 |
-
" <td>NaN</td>\n",
|
| 115 |
-
" <td>Gram positive</td>\n",
|
| 116 |
-
" <td>Extracellular</td>\n",
|
| 117 |
-
" <td>Firmicutes</td>\n",
|
| 118 |
-
" <td>Bacilli</td>\n",
|
| 119 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 120 |
-
" <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
|
| 121 |
-
" </tr>\n",
|
| 122 |
-
" </tbody>\n",
|
| 123 |
-
"</table>\n",
|
| 124 |
-
"</div>"
|
| 125 |
-
],
|
| 126 |
-
"text/plain": [
|
| 127 |
-
" SwissProt_ID Refseq_Accession Other_Accession GramStain \\\n",
|
| 128 |
-
"0 P50307 NaN NaN Gram positive \n",
|
| 129 |
-
"1 P01552 NaN NaN Gram positive \n",
|
| 130 |
-
"2 P09978 NaN NaN Gram positive \n",
|
| 131 |
-
"3 P45723 NaN NaN Gram positive \n",
|
| 132 |
-
"4 P81177 NaN NaN Gram positive \n",
|
| 133 |
-
"\n",
|
| 134 |
-
" Experimental_Localization Phylum Class Organism \\\n",
|
| 135 |
-
"0 Cytoplasmic Firmicutes Bacilli Staphylococcus aureus \n",
|
| 136 |
-
"1 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 137 |
-
"2 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 138 |
-
"3 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 139 |
-
"4 Extracellular Firmicutes Bacilli Staphylococcus aureus \n",
|
| 140 |
-
"\n",
|
| 141 |
-
" sequence \n",
|
| 142 |
-
"0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
|
| 143 |
-
"1 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
|
| 144 |
-
"2 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
|
| 145 |
-
"3 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
|
| 146 |
-
"4 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
|
| 147 |
-
]
|
| 148 |
-
},
|
| 149 |
-
"execution_count": 4,
|
| 150 |
-
"metadata": {},
|
| 151 |
-
"output_type": "execute_result"
|
| 152 |
-
}
|
| 153 |
-
],
|
| 154 |
-
"source": [
|
| 155 |
-
"sequences: pd.DataFrame = pd.read_csv('../Data/trainingData.csv')\n",
|
| 156 |
-
"sequences.head()"
|
| 157 |
-
]
|
| 158 |
-
},
|
| 159 |
-
{
|
| 160 |
-
"cell_type": "code",
|
| 161 |
-
"execution_count": null,
|
| 162 |
-
"id": "07a49fd0",
|
| 163 |
-
"metadata": {},
|
| 164 |
-
"outputs": [],
|
| 165 |
-
"source": [
|
| 166 |
-
"client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_300m\").to(\"cuda\")"
|
| 167 |
-
]
|
| 168 |
-
},
|
| 169 |
-
{
|
| 170 |
-
"cell_type": "code",
|
| 171 |
-
"execution_count": null,
|
| 172 |
-
"id": "e562c770",
|
| 173 |
-
"metadata": {},
|
| 174 |
-
"outputs": [],
|
| 175 |
-
"source": [
|
| 176 |
-
"# Set up output directories and metadata file.\n",
|
| 177 |
-
"embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm300m/embeddings\")\n",
|
| 178 |
-
"os.makedirs(embeddings_dir, exist_ok=True)"
|
| 179 |
-
]
|
| 180 |
-
},
|
| 181 |
-
{
|
| 182 |
-
"cell_type": "code",
|
| 183 |
-
"execution_count": null,
|
| 184 |
-
"id": "294c6798",
|
| 185 |
-
"metadata": {},
|
| 186 |
-
"outputs": [],
|
| 187 |
-
"source": [
|
| 188 |
-
"# --- Your provided function ---\n",
|
| 189 |
-
"def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:\n",
|
| 190 |
-
" protein = ESMProtein(sequence=sequence)\n",
|
| 191 |
-
" protein_tensor = client.encode(protein)\n",
|
| 192 |
-
" if isinstance(protein_tensor, ESMProteinError):\n",
|
| 193 |
-
" raise protein_tensor\n",
|
| 194 |
-
" output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))\n",
|
| 195 |
-
" return output\n",
|
| 196 |
-
"\n",
|
| 197 |
-
"\n",
|
| 198 |
-
"def save_emb(dir: str, df: pd.DataFrame, client: ESM3ForgeInferenceClient) -> None:\n",
|
| 199 |
-
" dir = os.path.expanduser(dir)\n",
|
| 200 |
-
" os.makedirs(dir, exist_ok=True)\n",
|
| 201 |
-
"\n",
|
| 202 |
-
" for i in tqdm(df.index, desc=\"Embedding sequences\"):\n",
|
| 203 |
-
" try:\n",
|
| 204 |
-
" output: LogitsOutput = embed_sequence(client=client, sequence=df.loc[i, 'sequence'])\n",
|
| 205 |
-
" embeddings_np: np.ndarray = output.embeddings.cpu().numpy()\n",
|
| 206 |
-
"\n",
|
| 207 |
-
" if not pd.isna(df.loc[i, 'SwissProt_ID']):\n",
|
| 208 |
-
" identifier = df.loc[i, 'SwissProt_ID']\n",
|
| 209 |
-
" elif not pd.isna(df.loc[i, 'Refseq_Accession']):\n",
|
| 210 |
-
" identifier = df.loc[i, 'Refseq_Accession']\n",
|
| 211 |
-
" elif not pd.isna(df.loc[i, 'Other_Accession']):\n",
|
| 212 |
-
" identifier = df.loc[i, 'Other_Accession']\n",
|
| 213 |
-
" else:\n",
|
| 214 |
-
" identifier = f\"unknown_{i}\"\n",
|
| 215 |
-
"\n",
|
| 216 |
-
" file_path: str = os.path.join(dir, f\"{identifier}.npy\")\n",
|
| 217 |
-
" np.save(file_path, embeddings_np)\n",
|
| 218 |
-
"\n",
|
| 219 |
-
" del output\n",
|
| 220 |
-
" gc.collect()\n",
|
| 221 |
-
" torch.cuda.empty_cache()\n",
|
| 222 |
-
"\n",
|
| 223 |
-
" except Exception as e:\n",
|
| 224 |
-
" print(f\"Error embedding index {i}: {e}\")"
|
| 225 |
-
]
|
| 226 |
-
},
|
| 227 |
-
{
|
| 228 |
-
"cell_type": "code",
|
| 229 |
-
"execution_count": null,
|
| 230 |
-
"id": "80db4990",
|
| 231 |
-
"metadata": {},
|
| 232 |
-
"outputs": [],
|
| 233 |
-
"source": [
|
| 234 |
-
"\n",
|
| 235 |
-
" \n",
|
| 236 |
-
"# Pass metadata_writer (and client if needed) to your function\n",
|
| 237 |
-
"save_emb(embeddings_dir, sequences,client = client)\n"
|
| 238 |
-
]
|
| 239 |
-
},
|
| 240 |
-
{
|
| 241 |
-
"cell_type": "code",
|
| 242 |
-
"execution_count": null,
|
| 243 |
-
"id": "77bf92c6",
|
| 244 |
-
"metadata": {},
|
| 245 |
-
"outputs": [],
|
| 246 |
-
"source": [
|
| 247 |
-
"sequences.loc[[11392]]"
|
| 248 |
-
]
|
| 249 |
-
},
|
| 250 |
-
{
|
| 251 |
-
"cell_type": "code",
|
| 252 |
-
"execution_count": 9,
|
| 253 |
-
"id": "365d9fdb",
|
| 254 |
-
"metadata": {},
|
| 255 |
-
"outputs": [],
|
| 256 |
-
"source": [
|
| 257 |
-
"sequences = sequences.drop(index=11392)"
|
| 258 |
-
]
|
| 259 |
-
},
|
| 260 |
-
{
|
| 261 |
-
"cell_type": "code",
|
| 262 |
-
"execution_count": null,
|
| 263 |
-
"id": "ad8a1990",
|
| 264 |
-
"metadata": {},
|
| 265 |
-
"outputs": [],
|
| 266 |
-
"source": [
|
| 267 |
-
"# Set up output directories and metadata file.\n",
|
| 268 |
-
"embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
|
| 269 |
-
"os.makedirs(embeddings_dir, exist_ok=True)\n",
|
| 270 |
-
"client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
|
| 271 |
-
]
|
| 272 |
-
},
|
| 273 |
-
{
|
| 274 |
-
"cell_type": "code",
|
| 275 |
-
"execution_count": null,
|
| 276 |
-
"id": "d42e5263",
|
| 277 |
-
"metadata": {},
|
| 278 |
-
"outputs": [],
|
| 279 |
-
"source": [
|
| 280 |
-
"save_emb(embeddings_dir, sequences,client = client)"
|
| 281 |
-
]
|
| 282 |
-
},
|
| 283 |
-
{
|
| 284 |
-
"cell_type": "code",
|
| 285 |
-
"execution_count": 2,
|
| 286 |
-
"id": "df91fc10",
|
| 287 |
-
"metadata": {},
|
| 288 |
-
"outputs": [],
|
| 289 |
-
"source": [
|
| 290 |
-
"def load_single_embedding(row, id_col, path):\n",
|
| 291 |
-
" try:\n",
|
| 292 |
-
" emb = np.load(os.path.join(path, f\"{row[id_col]}.npy\"))\n",
|
| 293 |
-
" emb = emb.squeeze(axis=0)\n",
|
| 294 |
-
" emb = np.mean(emb, axis=0)\n",
|
| 295 |
-
" return emb\n",
|
| 296 |
-
" except Exception as e:\n",
|
| 297 |
-
" print(f\"Error loading embedding {row[id_col]} due to {e}\")\n",
|
| 298 |
-
" return None\n",
|
| 299 |
-
"\n",
|
| 300 |
-
"def load_emb_parallel(df: pd.DataFrame, id_col: str, path: str, max_workers=None) -> list:\n",
|
| 301 |
-
" embeddings = []\n",
|
| 302 |
-
" with ProcessPoolExecutor(max_workers=max_workers) as executor:\n",
|
| 303 |
-
" futures = {\n",
|
| 304 |
-
" executor.submit(load_single_embedding, df.loc[i], id_col, path): i for i in df.index\n",
|
| 305 |
-
" }\n",
|
| 306 |
-
"\n",
|
| 307 |
-
" for future in tqdm(as_completed(futures), total=len(futures), desc=\"Loading embeddings\"):\n",
|
| 308 |
-
" emb = future.result()\n",
|
| 309 |
-
" if emb is not None:\n",
|
| 310 |
-
" embeddings.append(emb)\n",
|
| 311 |
-
" return embeddings\n",
|
| 312 |
-
"\n"
|
| 313 |
-
]
|
| 314 |
-
},
|
| 315 |
-
{
|
| 316 |
-
"cell_type": "code",
|
| 317 |
-
"execution_count": 5,
|
| 318 |
-
"id": "329701f6",
|
| 319 |
-
"metadata": {},
|
| 320 |
-
"outputs": [],
|
| 321 |
-
"source": [
|
| 322 |
-
"sequences['Preferred_ID'] = sequences['SwissProt_ID'].fillna(sequences['Refseq_Accession']).fillna(sequences['Other_Accession'])\n"
|
| 323 |
-
]
|
| 324 |
-
},
|
| 325 |
-
{
|
| 326 |
-
"cell_type": "code",
|
| 327 |
-
"execution_count": 6,
|
| 328 |
-
"id": "9b720ff2",
|
| 329 |
-
"metadata": {},
|
| 330 |
-
"outputs": [
|
| 331 |
-
{
|
| 332 |
-
"name": "stderr",
|
| 333 |
-
"output_type": "stream",
|
| 334 |
-
"text": [
|
| 335 |
-
"Loading embeddings: 97%|█████████▋| 11377/11691 [05:32<00:10, 31.20it/s]"
|
| 336 |
-
]
|
| 337 |
-
},
|
| 338 |
-
{
|
| 339 |
-
"name": "stdout",
|
| 340 |
-
"output_type": "stream",
|
| 341 |
-
"text": [
|
| 342 |
-
"Error loading embedding Q9I120 due to [Errno 2] No such file or directory: '/home/jpuglia/Documentos/Tesis/datosGenerados/esm600m/embeddings/Q9I120.npy'\n"
|
| 343 |
-
]
|
| 344 |
-
},
|
| 345 |
-
{
|
| 346 |
-
"name": "stderr",
|
| 347 |
-
"output_type": "stream",
|
| 348 |
-
"text": [
|
| 349 |
-
"Loading embeddings: 100%|██████████| 11691/11691 [05:40<00:00, 34.29it/s]\n"
|
| 350 |
-
]
|
| 351 |
-
}
|
| 352 |
-
],
|
| 353 |
-
"source": [
|
| 354 |
-
"embeddings_dir = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/esm600m/embeddings\")\n",
|
| 355 |
-
"embeddings = load_emb_parallel(sequences, 'Preferred_ID',embeddings_dir)"
|
| 356 |
-
]
|
| 357 |
-
},
|
| 358 |
-
{
|
| 359 |
-
"cell_type": "code",
|
| 360 |
-
"execution_count": 15,
|
| 361 |
-
"id": "765209e3",
|
| 362 |
-
"metadata": {},
|
| 363 |
-
"outputs": [
|
| 364 |
-
{
|
| 365 |
-
"name": "stdout",
|
| 366 |
-
"output_type": "stream",
|
| 367 |
-
"text": [
|
| 368 |
-
"Embeddings count: 11690\n",
|
| 369 |
-
"Sequences count: 11690\n"
|
| 370 |
-
]
|
| 371 |
-
}
|
| 372 |
-
],
|
| 373 |
-
"source": [
|
| 374 |
-
"print(f\"Embeddings count: {len(embeddings)}\")\n",
|
| 375 |
-
"print(f\"Sequences count: {len(sequences)}\")\n"
|
| 376 |
-
]
|
| 377 |
-
},
|
| 378 |
-
{
|
| 379 |
-
"cell_type": "code",
|
| 380 |
-
"execution_count": 17,
|
| 381 |
-
"id": "63bf7f6c",
|
| 382 |
-
"metadata": {},
|
| 383 |
-
"outputs": [
|
| 384 |
-
{
|
| 385 |
-
"data": {
|
| 386 |
-
"text/plain": [
|
| 387 |
-
"(1152,)"
|
| 388 |
-
]
|
| 389 |
-
},
|
| 390 |
-
"execution_count": 17,
|
| 391 |
-
"metadata": {},
|
| 392 |
-
"output_type": "execute_result"
|
| 393 |
-
}
|
| 394 |
-
],
|
| 395 |
-
"source": [
|
| 396 |
-
"embeddings[0].shape"
|
| 397 |
-
]
|
| 398 |
-
}
|
| 399 |
-
],
|
| 400 |
-
"metadata": {
|
| 401 |
-
"kernelspec": {
|
| 402 |
-
"display_name": "tesisEnv",
|
| 403 |
-
"language": "python",
|
| 404 |
-
"name": "python3"
|
| 405 |
-
},
|
| 406 |
-
"language_info": {
|
| 407 |
-
"codemirror_mode": {
|
| 408 |
-
"name": "ipython",
|
| 409 |
-
"version": 3
|
| 410 |
-
},
|
| 411 |
-
"file_extension": ".py",
|
| 412 |
-
"mimetype": "text/x-python",
|
| 413 |
-
"name": "python",
|
| 414 |
-
"nbconvert_exporter": "python",
|
| 415 |
-
"pygments_lexer": "ipython3",
|
| 416 |
-
"version": "3.10.16"
|
| 417 |
-
}
|
| 418 |
-
},
|
| 419 |
-
"nbformat": 4,
|
| 420 |
-
"nbformat_minor": 5
|
| 421 |
-
}
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d7bcc1ec16a5d8992cfdb6ca4d61d8c69cad64b683a697f6622e0c1f0d921076
|
| 3 |
+
size 13125
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/ESMC_600m.ipynb
CHANGED
|
@@ -1,256 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
"cell_type": "code",
|
| 5 |
-
"execution_count": 2,
|
| 6 |
-
"id": "c409c4ad",
|
| 7 |
-
"metadata": {},
|
| 8 |
-
"outputs": [],
|
| 9 |
-
"source": [
|
| 10 |
-
"from esm.models.esmc import ESMC\n",
|
| 11 |
-
"from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput\n",
|
| 12 |
-
"from esm.sdk.forge import ESM3ForgeInferenceClient\n",
|
| 13 |
-
"from esm.sdk import batch_executor\n",
|
| 14 |
-
"import pandas as pd\n",
|
| 15 |
-
"import os\n",
|
| 16 |
-
"import csv\n",
|
| 17 |
-
"import numpy as np\n",
|
| 18 |
-
"import torch"
|
| 19 |
-
]
|
| 20 |
-
},
|
| 21 |
-
{
|
| 22 |
-
"cell_type": "code",
|
| 23 |
-
"execution_count": 3,
|
| 24 |
-
"id": "7f8f916c",
|
| 25 |
-
"metadata": {},
|
| 26 |
-
"outputs": [
|
| 27 |
-
{
|
| 28 |
-
"data": {
|
| 29 |
-
"text/html": [
|
| 30 |
-
"<div>\n",
|
| 31 |
-
"<style scoped>\n",
|
| 32 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
| 33 |
-
" vertical-align: middle;\n",
|
| 34 |
-
" }\n",
|
| 35 |
-
"\n",
|
| 36 |
-
" .dataframe tbody tr th {\n",
|
| 37 |
-
" vertical-align: top;\n",
|
| 38 |
-
" }\n",
|
| 39 |
-
"\n",
|
| 40 |
-
" .dataframe thead th {\n",
|
| 41 |
-
" text-align: right;\n",
|
| 42 |
-
" }\n",
|
| 43 |
-
"</style>\n",
|
| 44 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
| 45 |
-
" <thead>\n",
|
| 46 |
-
" <tr style=\"text-align: right;\">\n",
|
| 47 |
-
" <th></th>\n",
|
| 48 |
-
" <th>SwissProt_ID</th>\n",
|
| 49 |
-
" <th>Experimental_Localization</th>\n",
|
| 50 |
-
" <th>Organism</th>\n",
|
| 51 |
-
" <th>Phylum</th>\n",
|
| 52 |
-
" <th>Class</th>\n",
|
| 53 |
-
" <th>GramStain</th>\n",
|
| 54 |
-
" <th>Sequence</th>\n",
|
| 55 |
-
" </tr>\n",
|
| 56 |
-
" </thead>\n",
|
| 57 |
-
" <tbody>\n",
|
| 58 |
-
" <tr>\n",
|
| 59 |
-
" <th>0</th>\n",
|
| 60 |
-
" <td>P50307</td>\n",
|
| 61 |
-
" <td>Cytoplasmic</td>\n",
|
| 62 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 63 |
-
" <td>Firmicutes</td>\n",
|
| 64 |
-
" <td>Bacilli</td>\n",
|
| 65 |
-
" <td>1.0</td>\n",
|
| 66 |
-
" <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
|
| 67 |
-
" </tr>\n",
|
| 68 |
-
" <tr>\n",
|
| 69 |
-
" <th>1</th>\n",
|
| 70 |
-
" <td>P01552</td>\n",
|
| 71 |
-
" <td>Extracellular</td>\n",
|
| 72 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 73 |
-
" <td>Firmicutes</td>\n",
|
| 74 |
-
" <td>Bacilli</td>\n",
|
| 75 |
-
" <td>1.0</td>\n",
|
| 76 |
-
" <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
|
| 77 |
-
" </tr>\n",
|
| 78 |
-
" <tr>\n",
|
| 79 |
-
" <th>2</th>\n",
|
| 80 |
-
" <td>P09978</td>\n",
|
| 81 |
-
" <td>Extracellular</td>\n",
|
| 82 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 83 |
-
" <td>Firmicutes</td>\n",
|
| 84 |
-
" <td>Bacilli</td>\n",
|
| 85 |
-
" <td>1.0</td>\n",
|
| 86 |
-
" <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
|
| 87 |
-
" </tr>\n",
|
| 88 |
-
" <tr>\n",
|
| 89 |
-
" <th>3</th>\n",
|
| 90 |
-
" <td>P45723</td>\n",
|
| 91 |
-
" <td>Extracellular</td>\n",
|
| 92 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 93 |
-
" <td>Firmicutes</td>\n",
|
| 94 |
-
" <td>Bacilli</td>\n",
|
| 95 |
-
" <td>1.0</td>\n",
|
| 96 |
-
" <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
|
| 97 |
-
" </tr>\n",
|
| 98 |
-
" <tr>\n",
|
| 99 |
-
" <th>4</th>\n",
|
| 100 |
-
" <td>P81177</td>\n",
|
| 101 |
-
" <td>Extracellular</td>\n",
|
| 102 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 103 |
-
" <td>Firmicutes</td>\n",
|
| 104 |
-
" <td>Bacilli</td>\n",
|
| 105 |
-
" <td>1.0</td>\n",
|
| 106 |
-
" <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
|
| 107 |
-
" </tr>\n",
|
| 108 |
-
" </tbody>\n",
|
| 109 |
-
"</table>\n",
|
| 110 |
-
"</div>"
|
| 111 |
-
],
|
| 112 |
-
"text/plain": [
|
| 113 |
-
" SwissProt_ID Experimental_Localization Organism Phylum \\\n",
|
| 114 |
-
"0 P50307 Cytoplasmic Staphylococcus aureus Firmicutes \n",
|
| 115 |
-
"1 P01552 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 116 |
-
"2 P09978 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 117 |
-
"3 P45723 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 118 |
-
"4 P81177 Extracellular Staphylococcus aureus Firmicutes \n",
|
| 119 |
-
"\n",
|
| 120 |
-
" Class GramStain Sequence \n",
|
| 121 |
-
"0 Bacilli 1.0 MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
|
| 122 |
-
"1 Bacilli 1.0 MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
|
| 123 |
-
"2 Bacilli 1.0 MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
|
| 124 |
-
"3 Bacilli 1.0 MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
|
| 125 |
-
"4 Bacilli 1.0 MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... "
|
| 126 |
-
]
|
| 127 |
-
},
|
| 128 |
-
"execution_count": 3,
|
| 129 |
-
"metadata": {},
|
| 130 |
-
"output_type": "execute_result"
|
| 131 |
-
}
|
| 132 |
-
],
|
| 133 |
-
"source": [
|
| 134 |
-
"sequences: pd.DataFrame = pd.read_csv('/home/jpuglia/Documentos/Tesis/tesisESM/Data/trainingData.csv')\n",
|
| 135 |
-
"sequences.head()"
|
| 136 |
-
]
|
| 137 |
-
},
|
| 138 |
-
{
|
| 139 |
-
"cell_type": "code",
|
| 140 |
-
"execution_count": null,
|
| 141 |
-
"id": "d7026979",
|
| 142 |
-
"metadata": {},
|
| 143 |
-
"outputs": [
|
| 144 |
-
{
|
| 145 |
-
"ename": "ValueError",
|
| 146 |
-
"evalue": "The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().",
|
| 147 |
-
"output_type": "error",
|
| 148 |
-
"traceback": [
|
| 149 |
-
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
|
| 150 |
-
"\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
|
| 151 |
-
"\u001b[0;32m/tmp/ipykernel_118460/767462261.py\u001b[0m in \u001b[0;36m?\u001b[0;34m()\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[0misfloat\u001b[0m\u001b[0;34m:\u001b[0m \u001b[0mbool\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'Sequence'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mapply\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;32mlambda\u001b[0m \u001b[0mx\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0misinstance\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0mfloat\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 2\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 3\u001b[0;31m \u001b[0msequences\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;32mnot\u001b[0m \u001b[0msequences\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0misfloat\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
|
| 152 |
-
"\u001b[0;32m~/miniconda3/envs/tesisEnv/lib/python3.10/site-packages/pandas/core/generic.py\u001b[0m in \u001b[0;36m?\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1575\u001b[0m \u001b[0;34m@\u001b[0m\u001b[0mfinal\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1576\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0m__nonzero__\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mself\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m->\u001b[0m \u001b[0mNoReturn\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m-> 1577\u001b[0;31m raise ValueError(\n\u001b[0m\u001b[1;32m 1578\u001b[0m \u001b[0;34mf\"The truth value of a {type(self).__name__} is ambiguous. \"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1579\u001b[0m \u001b[0;34m\"Use a.empty, a.bool(), a.item(), a.any() or a.all().\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 1580\u001b[0m )\n",
|
| 153 |
-
"\u001b[0;31mValueError\u001b[0m: The truth value of a DataFrame is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all()."
|
| 154 |
-
]
|
| 155 |
-
}
|
| 156 |
-
],
|
| 157 |
-
"source": [
|
| 158 |
-
"isfloat: bool = sequences['Sequence'].apply(lambda x:isinstance(x,float))\n",
|
| 159 |
-
"\n",
|
| 160 |
-
"sequences = sequences[~isfloat]"
|
| 161 |
-
]
|
| 162 |
-
},
|
| 163 |
-
{
|
| 164 |
-
"cell_type": "code",
|
| 165 |
-
"execution_count": null,
|
| 166 |
-
"id": "ea723ad9",
|
| 167 |
-
"metadata": {},
|
| 168 |
-
"outputs": [],
|
| 169 |
-
"source": [
|
| 170 |
-
"sequences = sequences.dropna()\n",
|
| 171 |
-
"sequences = sequences.drop_duplicates()\n",
|
| 172 |
-
"sequences.shape"
|
| 173 |
-
]
|
| 174 |
-
},
|
| 175 |
-
{
|
| 176 |
-
"cell_type": "code",
|
| 177 |
-
"execution_count": null,
|
| 178 |
-
"id": "07a49fd0",
|
| 179 |
-
"metadata": {},
|
| 180 |
-
"outputs": [],
|
| 181 |
-
"source": [
|
| 182 |
-
"torch.cuda.empty_cache()\n",
|
| 183 |
-
"client: ESM3ForgeInferenceClient = ESMC.from_pretrained(\"esmc_600m\").to(\"cuda\")"
|
| 184 |
-
]
|
| 185 |
-
},
|
| 186 |
-
{
|
| 187 |
-
"cell_type": "code",
|
| 188 |
-
"execution_count": null,
|
| 189 |
-
"id": "294c6798",
|
| 190 |
-
"metadata": {},
|
| 191 |
-
"outputs": [],
|
| 192 |
-
"source": [
|
| 193 |
-
"# Set up output directories and metadata file.\n",
|
| 194 |
-
"embeddings_dir = \"/home/jpuglia/Documentos/Tesis/datosGenerados/esm600m/embeddings\"\n",
|
| 195 |
-
"os.makedirs(embeddings_dir, exist_ok=True)\n",
|
| 196 |
-
"\n",
|
| 197 |
-
"def embed_sequence(client: ESM3ForgeInferenceClient, sequence: str) -> LogitsOutput:\n",
|
| 198 |
-
" \n",
|
| 199 |
-
" protein = ESMProtein(sequence=sequence)\n",
|
| 200 |
-
" protein_tensor = client.encode(protein)\n",
|
| 201 |
-
" if isinstance(protein_tensor, ESMProteinError):\n",
|
| 202 |
-
" raise protein_tensor\n",
|
| 203 |
-
" output = client.logits(protein_tensor, LogitsConfig(sequence=True, return_embeddings=True))\n",
|
| 204 |
-
" return output\n",
|
| 205 |
-
"\n",
|
| 206 |
-
"\n",
|
| 207 |
-
"def save_emb(dir: str, df : pd.DataFrame) -> None:\n",
|
| 208 |
-
" \n",
|
| 209 |
-
" for i in df.index:\n",
|
| 210 |
-
" \n",
|
| 211 |
-
" output: LogitsOutput = embed_sequence(client = client, sequence = df.loc[i, 'Sequence'])\n",
|
| 212 |
-
" \n",
|
| 213 |
-
" embeddings_np : np.ndarray = output.embeddings.cpu().numpy()\n",
|
| 214 |
-
" \n",
|
| 215 |
-
" file_path : str = os.path.join(dir,f\"{df.loc[i, 'SwissProt_ID']}.npy\") \n",
|
| 216 |
-
"\n",
|
| 217 |
-
" np.save(file_path, embeddings_np)\n",
|
| 218 |
-
" \n",
|
| 219 |
-
" del output\n",
|
| 220 |
-
" \n",
|
| 221 |
-
" torch.cuda.empty_cache()"
|
| 222 |
-
]
|
| 223 |
-
},
|
| 224 |
-
{
|
| 225 |
-
"cell_type": "code",
|
| 226 |
-
"execution_count": null,
|
| 227 |
-
"id": "80db4990",
|
| 228 |
-
"metadata": {},
|
| 229 |
-
"outputs": [],
|
| 230 |
-
"source": [
|
| 231 |
-
"save_emb(embeddings_dir, sequences)\n"
|
| 232 |
-
]
|
| 233 |
-
}
|
| 234 |
-
],
|
| 235 |
-
"metadata": {
|
| 236 |
-
"kernelspec": {
|
| 237 |
-
"display_name": "tesisEnv",
|
| 238 |
-
"language": "python",
|
| 239 |
-
"name": "python3"
|
| 240 |
-
},
|
| 241 |
-
"language_info": {
|
| 242 |
-
"codemirror_mode": {
|
| 243 |
-
"name": "ipython",
|
| 244 |
-
"version": 3
|
| 245 |
-
},
|
| 246 |
-
"file_extension": ".py",
|
| 247 |
-
"mimetype": "text/x-python",
|
| 248 |
-
"name": "python",
|
| 249 |
-
"nbconvert_exporter": "python",
|
| 250 |
-
"pygments_lexer": "ipython3",
|
| 251 |
-
"version": "3.10.16"
|
| 252 |
-
}
|
| 253 |
-
},
|
| 254 |
-
"nbformat": 4,
|
| 255 |
-
"nbformat_minor": 5
|
| 256 |
-
}
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:06608effffc76bbb3ca65c67a84f06762d459b56a823bff9e61695cab83bb10c
|
| 3 |
+
size 10350
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/EmbAnalisis.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f56a416d1a8fb454ba368583013118d8fc490964dd036d3b3ce8c5879a4393b3
|
| 3 |
+
size 10635423
|
notebooks/ProstT5.ipynb
CHANGED
|
@@ -1,526 +1,3 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
| 4 |
-
"cell_type": "code",
|
| 5 |
-
"execution_count": 1,
|
| 6 |
-
"id": "40b1e04a",
|
| 7 |
-
"metadata": {},
|
| 8 |
-
"outputs": [],
|
| 9 |
-
"source": [
|
| 10 |
-
"import pandas as pd\n",
|
| 11 |
-
"from transformers import T5Tokenizer, T5EncoderModel\n",
|
| 12 |
-
"import torch\n",
|
| 13 |
-
"import re\n",
|
| 14 |
-
"from tqdm.notebook import tqdm\n",
|
| 15 |
-
"import os\n",
|
| 16 |
-
"import numpy as np\n",
|
| 17 |
-
"import gc\n",
|
| 18 |
-
"\n",
|
| 19 |
-
"os.environ[\"PYTORCH_CUDA_ALLOC_CONF\"] = \"expandable_segments:True\"\n"
|
| 20 |
-
]
|
| 21 |
-
},
|
| 22 |
-
{
|
| 23 |
-
"cell_type": "code",
|
| 24 |
-
"execution_count": 2,
|
| 25 |
-
"id": "f4c8ff50",
|
| 26 |
-
"metadata": {},
|
| 27 |
-
"outputs": [
|
| 28 |
-
{
|
| 29 |
-
"data": {
|
| 30 |
-
"text/html": [
|
| 31 |
-
"<div>\n",
|
| 32 |
-
"<style scoped>\n",
|
| 33 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
| 34 |
-
" vertical-align: middle;\n",
|
| 35 |
-
" }\n",
|
| 36 |
-
"\n",
|
| 37 |
-
" .dataframe tbody tr th {\n",
|
| 38 |
-
" vertical-align: top;\n",
|
| 39 |
-
" }\n",
|
| 40 |
-
"\n",
|
| 41 |
-
" .dataframe thead th {\n",
|
| 42 |
-
" text-align: right;\n",
|
| 43 |
-
" }\n",
|
| 44 |
-
"</style>\n",
|
| 45 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
| 46 |
-
" <thead>\n",
|
| 47 |
-
" <tr style=\"text-align: right;\">\n",
|
| 48 |
-
" <th></th>\n",
|
| 49 |
-
" <th>GramStain</th>\n",
|
| 50 |
-
" <th>Experimental_Localization</th>\n",
|
| 51 |
-
" <th>Phylum</th>\n",
|
| 52 |
-
" <th>Class</th>\n",
|
| 53 |
-
" <th>Organism</th>\n",
|
| 54 |
-
" <th>sequence</th>\n",
|
| 55 |
-
" <th>id</th>\n",
|
| 56 |
-
" </tr>\n",
|
| 57 |
-
" </thead>\n",
|
| 58 |
-
" <tbody>\n",
|
| 59 |
-
" <tr>\n",
|
| 60 |
-
" <th>0</th>\n",
|
| 61 |
-
" <td>Gram positive</td>\n",
|
| 62 |
-
" <td>Cytoplasmic</td>\n",
|
| 63 |
-
" <td>Firmicutes</td>\n",
|
| 64 |
-
" <td>Bacilli</td>\n",
|
| 65 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 66 |
-
" <td>MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET...</td>\n",
|
| 67 |
-
" <td>P50307</td>\n",
|
| 68 |
-
" </tr>\n",
|
| 69 |
-
" <tr>\n",
|
| 70 |
-
" <th>1</th>\n",
|
| 71 |
-
" <td>Gram positive</td>\n",
|
| 72 |
-
" <td>Extracellular</td>\n",
|
| 73 |
-
" <td>Firmicutes</td>\n",
|
| 74 |
-
" <td>Bacilli</td>\n",
|
| 75 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 76 |
-
" <td>MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG...</td>\n",
|
| 77 |
-
" <td>P01552</td>\n",
|
| 78 |
-
" </tr>\n",
|
| 79 |
-
" <tr>\n",
|
| 80 |
-
" <th>2</th>\n",
|
| 81 |
-
" <td>Gram positive</td>\n",
|
| 82 |
-
" <td>Extracellular</td>\n",
|
| 83 |
-
" <td>Firmicutes</td>\n",
|
| 84 |
-
" <td>Bacilli</td>\n",
|
| 85 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 86 |
-
" <td>MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV...</td>\n",
|
| 87 |
-
" <td>P09978</td>\n",
|
| 88 |
-
" </tr>\n",
|
| 89 |
-
" <tr>\n",
|
| 90 |
-
" <th>3</th>\n",
|
| 91 |
-
" <td>Gram positive</td>\n",
|
| 92 |
-
" <td>Extracellular</td>\n",
|
| 93 |
-
" <td>Firmicutes</td>\n",
|
| 94 |
-
" <td>Bacilli</td>\n",
|
| 95 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 96 |
-
" <td>MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT...</td>\n",
|
| 97 |
-
" <td>P45723</td>\n",
|
| 98 |
-
" </tr>\n",
|
| 99 |
-
" <tr>\n",
|
| 100 |
-
" <th>4</th>\n",
|
| 101 |
-
" <td>Gram positive</td>\n",
|
| 102 |
-
" <td>Extracellular</td>\n",
|
| 103 |
-
" <td>Firmicutes</td>\n",
|
| 104 |
-
" <td>Bacilli</td>\n",
|
| 105 |
-
" <td>Staphylococcus aureus</td>\n",
|
| 106 |
-
" <td>MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK...</td>\n",
|
| 107 |
-
" <td>P81177</td>\n",
|
| 108 |
-
" </tr>\n",
|
| 109 |
-
" </tbody>\n",
|
| 110 |
-
"</table>\n",
|
| 111 |
-
"</div>"
|
| 112 |
-
],
|
| 113 |
-
"text/plain": [
|
| 114 |
-
" GramStain Experimental_Localization Phylum Class \\\n",
|
| 115 |
-
"0 Gram positive Cytoplasmic Firmicutes Bacilli \n",
|
| 116 |
-
"1 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 117 |
-
"2 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 118 |
-
"3 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 119 |
-
"4 Gram positive Extracellular Firmicutes Bacilli \n",
|
| 120 |
-
"\n",
|
| 121 |
-
" Organism sequence \\\n",
|
| 122 |
-
"0 Staphylococcus aureus MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACET... \n",
|
| 123 |
-
"1 Staphylococcus aureus MYKRLFISHVILIFALILVISTPNVLAESQPDPKPDELHKSSKFTG... \n",
|
| 124 |
-
"2 Staphylococcus aureus MVKKTKSNSLKKVATLALANLLLVGALTDNSAKAESKKDDTDLKLV... \n",
|
| 125 |
-
"3 Staphylococcus aureus MSGWYHSAHASDSLSKSPENWMSKLDDGKHLTEINIPGSHDSGSFT... \n",
|
| 126 |
-
"4 Staphylococcus aureus MRKFSRYAFTSMATVTLLSSLTPAALASDTNHKPATSDINFEITQK... \n",
|
| 127 |
-
"\n",
|
| 128 |
-
" id \n",
|
| 129 |
-
"0 P50307 \n",
|
| 130 |
-
"1 P01552 \n",
|
| 131 |
-
"2 P09978 \n",
|
| 132 |
-
"3 P45723 \n",
|
| 133 |
-
"4 P81177 "
|
| 134 |
-
]
|
| 135 |
-
},
|
| 136 |
-
"execution_count": 2,
|
| 137 |
-
"metadata": {},
|
| 138 |
-
"output_type": "execute_result"
|
| 139 |
-
}
|
| 140 |
-
],
|
| 141 |
-
"source": [
|
| 142 |
-
"sequences_df = pd.read_csv('../Data/trainingData.csv')\n",
|
| 143 |
-
"sequences_df['id'] = sequences_df['SwissProt_ID'].fillna(sequences_df['Refseq_Accession'].fillna(sequences_df['Other_Accession']))\n",
|
| 144 |
-
"sequences_df = sequences_df.drop(columns=['SwissProt_ID', 'Refseq_Accession', 'Other_Accession'])\n",
|
| 145 |
-
"sequences_df.head()"
|
| 146 |
-
]
|
| 147 |
-
},
|
| 148 |
-
{
|
| 149 |
-
"cell_type": "code",
|
| 150 |
-
"execution_count": 3,
|
| 151 |
-
"id": "6925775b",
|
| 152 |
-
"metadata": {},
|
| 153 |
-
"outputs": [
|
| 154 |
-
{
|
| 155 |
-
"name": "stdout",
|
| 156 |
-
"output_type": "stream",
|
| 157 |
-
"text": [
|
| 158 |
-
"Secuencias 11691\n",
|
| 159 |
-
"Ids 11691\n"
|
| 160 |
-
]
|
| 161 |
-
}
|
| 162 |
-
],
|
| 163 |
-
"source": [
|
| 164 |
-
"sequences = list(sequences_df['sequence'].values)\n",
|
| 165 |
-
"accession = list(sequences_df['id'].values)\n",
|
| 166 |
-
"\n",
|
| 167 |
-
"print(f\"Secuencias {len(sequences)}\\nIds {len(accession)}\")"
|
| 168 |
-
]
|
| 169 |
-
},
|
| 170 |
-
{
|
| 171 |
-
"cell_type": "code",
|
| 172 |
-
"execution_count": 4,
|
| 173 |
-
"id": "c19ac1ba",
|
| 174 |
-
"metadata": {},
|
| 175 |
-
"outputs": [],
|
| 176 |
-
"source": [
|
| 177 |
-
"path = os.path.expanduser(\"~/Documentos/Tesis/datosGenerados/prost/embeddings\")"
|
| 178 |
-
]
|
| 179 |
-
},
|
| 180 |
-
{
|
| 181 |
-
"cell_type": "code",
|
| 182 |
-
"execution_count": 5,
|
| 183 |
-
"id": "5b5e321e",
|
| 184 |
-
"metadata": {},
|
| 185 |
-
"outputs": [
|
| 186 |
-
{
|
| 187 |
-
"name": "stderr",
|
| 188 |
-
"output_type": "stream",
|
| 189 |
-
"text": [
|
| 190 |
-
"You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565\n"
|
| 191 |
-
]
|
| 192 |
-
},
|
| 193 |
-
{
|
| 194 |
-
"data": {
|
| 195 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 196 |
-
"model_id": "17d989ac426c445dbfd209d0247a9a3d",
|
| 197 |
-
"version_major": 2,
|
| 198 |
-
"version_minor": 0
|
| 199 |
-
},
|
| 200 |
-
"text/plain": [
|
| 201 |
-
"Processing Sequences: 0%| | 0/11691 [00:00<?, ?it/s]"
|
| 202 |
-
]
|
| 203 |
-
},
|
| 204 |
-
"metadata": {},
|
| 205 |
-
"output_type": "display_data"
|
| 206 |
-
},
|
| 207 |
-
{
|
| 208 |
-
"name": "stdout",
|
| 209 |
-
"output_type": "stream",
|
| 210 |
-
"text": [
|
| 211 |
-
"Error CUDA out of memory. Tried to allocate 1.64 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.02 GiB is free. Including non-PyTorch memory, this process has 4.11 GiB memory in use. Of the allocated memory 4.00 GiB is allocated by PyTorch, and 10.56 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba CAC14227\n",
|
| 212 |
-
"Error CUDA out of memory. Tried to allocate 1.54 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.09 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.89 GiB is allocated by PyTorch, and 36.17 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P12255\n",
|
| 213 |
-
"Error CUDA out of memory. Tried to allocate 982.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 748.44 MiB is free. Including non-PyTorch memory, this process has 4.40 GiB memory in use. Of the allocated memory 4.25 GiB is allocated by PyTorch, and 51.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba P20471\n",
|
| 214 |
-
"Error CUDA out of memory. Tried to allocate 1024.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 711.88 MiB is free. Including non-PyTorch memory, this process has 4.48 GiB memory in use. Of the allocated memory 4.33 GiB is allocated by PyTorch, and 44.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba A64556\n",
|
| 215 |
-
"Error CUDA out of memory. Tried to allocate 1.28 GiB. GPU 0 has a total capacity of 5.59 GiB of which 111.88 MiB is free. Including non-PyTorch memory, this process has 5.07 GiB memory in use. Of the allocated memory 4.90 GiB is allocated by PyTorch, and 67.79 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba AAF25576\n",
|
| 216 |
-
"Error CUDA out of memory. Tried to allocate 1.55 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.14 GiB is free. Including non-PyTorch memory, this process has 4.03 GiB memory in use. Of the allocated memory 3.91 GiB is allocated by PyTorch, and 19.85 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q4L9P0\n",
|
| 217 |
-
"Error CUDA out of memory. Tried to allocate 1.04 GiB. GPU 0 has a total capacity of 5.59 GiB of which 591.88 MiB is free. Including non-PyTorch memory, this process has 4.60 GiB memory in use. Of the allocated memory 4.45 GiB is allocated by PyTorch, and 40.12 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I5N6\n",
|
| 218 |
-
"Error CUDA out of memory. Tried to allocate 1.49 GiB. GPU 0 has a total capacity of 5.59 GiB of which 1.22 GiB is free. Including non-PyTorch memory, this process has 3.95 GiB memory in use. Of the allocated memory 3.84 GiB is allocated by PyTorch, and 5.53 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I791\n",
|
| 219 |
-
"Error CUDA out of memory. Tried to allocate 32.00 MiB. GPU 0 has a total capacity of 5.59 GiB of which 31.88 MiB is free. Including non-PyTorch memory, this process has 5.14 GiB memory in use. Of the allocated memory 5.01 GiB is allocated by PyTorch, and 36.22 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables) mientras se procesaba Q9I120\n"
|
| 220 |
-
]
|
| 221 |
-
}
|
| 222 |
-
],
|
| 223 |
-
"source": [
|
| 224 |
-
"# Setup device\n",
|
| 225 |
-
"device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')\n",
|
| 226 |
-
"\n",
|
| 227 |
-
"# Load tokenizer and model\n",
|
| 228 |
-
"tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
|
| 229 |
-
"model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
|
| 230 |
-
"model.full() if device == 'cpu' else model.half()\n",
|
| 231 |
-
"\n",
|
| 232 |
-
"# Clean sequences\n",
|
| 233 |
-
"sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
|
| 234 |
-
"sequences = [ \"<AA2fold> \" + s for s in sequences]\n",
|
| 235 |
-
"\n",
|
| 236 |
-
"# Process each sequence individually\n",
|
| 237 |
-
"for i, (seq, acc_id) in enumerate(tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\")):\n",
|
| 238 |
-
" try:\n",
|
| 239 |
-
" # Tokenize\n",
|
| 240 |
-
" ids = tokenizer(\n",
|
| 241 |
-
" seq,\n",
|
| 242 |
-
" add_special_tokens=True,\n",
|
| 243 |
-
" return_tensors='pt'\n",
|
| 244 |
-
" ).to(device)\n",
|
| 245 |
-
"\n",
|
| 246 |
-
" # Forward pass\n",
|
| 247 |
-
" with torch.no_grad():\n",
|
| 248 |
-
" embedding_repr = model(\n",
|
| 249 |
-
" ids.input_ids,\n",
|
| 250 |
-
" attention_mask=ids.attention_mask\n",
|
| 251 |
-
" )\n",
|
| 252 |
-
"\n",
|
| 253 |
-
" # Compute actual length (excluding prefix)\n",
|
| 254 |
-
" real_len = ids.attention_mask[0].sum().item() - 1\n",
|
| 255 |
-
"\n",
|
| 256 |
-
" # Extract and average embeddings\n",
|
| 257 |
-
" emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
|
| 258 |
-
" emb_avg = emb.mean(dim=0).cpu().numpy()\n",
|
| 259 |
-
"\n",
|
| 260 |
-
" # Save embedding using accession ID\n",
|
| 261 |
-
" np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
|
| 262 |
-
"\n",
|
| 263 |
-
"\n",
|
| 264 |
-
" # Cleanup\n",
|
| 265 |
-
" del ids, embedding_repr, emb, emb_avg\n",
|
| 266 |
-
" torch.cuda.empty_cache()\n",
|
| 267 |
-
" gc.collect()\n",
|
| 268 |
-
"\n",
|
| 269 |
-
" except RuntimeError as e:\n",
|
| 270 |
-
" print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
|
| 271 |
-
"\n"
|
| 272 |
-
]
|
| 273 |
-
},
|
| 274 |
-
{
|
| 275 |
-
"cell_type": "code",
|
| 276 |
-
"execution_count": 6,
|
| 277 |
-
"id": "9ca9cb2d",
|
| 278 |
-
"metadata": {},
|
| 279 |
-
"outputs": [
|
| 280 |
-
{
|
| 281 |
-
"data": {
|
| 282 |
-
"text/html": [
|
| 283 |
-
"<div>\n",
|
| 284 |
-
"<style scoped>\n",
|
| 285 |
-
" .dataframe tbody tr th:only-of-type {\n",
|
| 286 |
-
" vertical-align: middle;\n",
|
| 287 |
-
" }\n",
|
| 288 |
-
"\n",
|
| 289 |
-
" .dataframe tbody tr th {\n",
|
| 290 |
-
" vertical-align: top;\n",
|
| 291 |
-
" }\n",
|
| 292 |
-
"\n",
|
| 293 |
-
" .dataframe thead th {\n",
|
| 294 |
-
" text-align: right;\n",
|
| 295 |
-
" }\n",
|
| 296 |
-
"</style>\n",
|
| 297 |
-
"<table border=\"1\" class=\"dataframe\">\n",
|
| 298 |
-
" <thead>\n",
|
| 299 |
-
" <tr style=\"text-align: right;\">\n",
|
| 300 |
-
" <th></th>\n",
|
| 301 |
-
" <th>GramStain</th>\n",
|
| 302 |
-
" <th>Experimental_Localization</th>\n",
|
| 303 |
-
" <th>Phylum</th>\n",
|
| 304 |
-
" <th>Class</th>\n",
|
| 305 |
-
" <th>Organism</th>\n",
|
| 306 |
-
" <th>sequence</th>\n",
|
| 307 |
-
" <th>id</th>\n",
|
| 308 |
-
" </tr>\n",
|
| 309 |
-
" </thead>\n",
|
| 310 |
-
" <tbody>\n",
|
| 311 |
-
" <tr>\n",
|
| 312 |
-
" <th>1532</th>\n",
|
| 313 |
-
" <td>Gram negative</td>\n",
|
| 314 |
-
" <td>OuterMembrane,Extracellular</td>\n",
|
| 315 |
-
" <td>Proteobacteria</td>\n",
|
| 316 |
-
" <td>Gammaproteobacteria</td>\n",
|
| 317 |
-
" <td>Yersinia pestis</td>\n",
|
| 318 |
-
" <td>MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ...</td>\n",
|
| 319 |
-
" <td>CAC14227</td>\n",
|
| 320 |
-
" </tr>\n",
|
| 321 |
-
" <tr>\n",
|
| 322 |
-
" <th>1683</th>\n",
|
| 323 |
-
" <td>Gram negative</td>\n",
|
| 324 |
-
" <td>OuterMembrane</td>\n",
|
| 325 |
-
" <td>Proteobacteria</td>\n",
|
| 326 |
-
" <td>Betaproteobacteria</td>\n",
|
| 327 |
-
" <td>Bordetella pertussis</td>\n",
|
| 328 |
-
" <td>MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS...</td>\n",
|
| 329 |
-
" <td>P12255</td>\n",
|
| 330 |
-
" </tr>\n",
|
| 331 |
-
" <tr>\n",
|
| 332 |
-
" <th>1767</th>\n",
|
| 333 |
-
" <td>Gram negative</td>\n",
|
| 334 |
-
" <td>CytoplasmicMembrane</td>\n",
|
| 335 |
-
" <td>Proteobacteria</td>\n",
|
| 336 |
-
" <td>Alphaproteobacteria</td>\n",
|
| 337 |
-
" <td>Sinorhizobium meliloti</td>\n",
|
| 338 |
-
" <td>MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG...</td>\n",
|
| 339 |
-
" <td>P20471</td>\n",
|
| 340 |
-
" </tr>\n",
|
| 341 |
-
" <tr>\n",
|
| 342 |
-
" <th>4089</th>\n",
|
| 343 |
-
" <td>Gram negative</td>\n",
|
| 344 |
-
" <td>OuterMembrane,Extracellular</td>\n",
|
| 345 |
-
" <td>Proteobacteria</td>\n",
|
| 346 |
-
" <td>Epsilonproteobacteria</td>\n",
|
| 347 |
-
" <td>Helicobacter pylori</td>\n",
|
| 348 |
-
" <td>MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW...</td>\n",
|
| 349 |
-
" <td>A64556</td>\n",
|
| 350 |
-
" </tr>\n",
|
| 351 |
-
" <tr>\n",
|
| 352 |
-
" <th>4623</th>\n",
|
| 353 |
-
" <td>Gram positive</td>\n",
|
| 354 |
-
" <td>Cellwall</td>\n",
|
| 355 |
-
" <td>Firmicutes</td>\n",
|
| 356 |
-
" <td>Bacilli</td>\n",
|
| 357 |
-
" <td>Lactobacillus reuteri</td>\n",
|
| 358 |
-
" <td>MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT...</td>\n",
|
| 359 |
-
" <td>AAF25576</td>\n",
|
| 360 |
-
" </tr>\n",
|
| 361 |
-
" </tbody>\n",
|
| 362 |
-
"</table>\n",
|
| 363 |
-
"</div>"
|
| 364 |
-
],
|
| 365 |
-
"text/plain": [
|
| 366 |
-
" GramStain Experimental_Localization Phylum \\\n",
|
| 367 |
-
"1532 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
|
| 368 |
-
"1683 Gram negative OuterMembrane Proteobacteria \n",
|
| 369 |
-
"1767 Gram negative CytoplasmicMembrane Proteobacteria \n",
|
| 370 |
-
"4089 Gram negative OuterMembrane,Extracellular Proteobacteria \n",
|
| 371 |
-
"4623 Gram positive Cellwall Firmicutes \n",
|
| 372 |
-
"\n",
|
| 373 |
-
" Class Organism \\\n",
|
| 374 |
-
"1532 Gammaproteobacteria Yersinia pestis \n",
|
| 375 |
-
"1683 Betaproteobacteria Bordetella pertussis \n",
|
| 376 |
-
"1767 Alphaproteobacteria Sinorhizobium meliloti \n",
|
| 377 |
-
"4089 Epsilonproteobacteria Helicobacter pylori \n",
|
| 378 |
-
"4623 Bacilli Lactobacillus reuteri \n",
|
| 379 |
-
"\n",
|
| 380 |
-
" sequence id \n",
|
| 381 |
-
"1532 MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQ... CAC14227 \n",
|
| 382 |
-
"1683 MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATS... P12255 \n",
|
| 383 |
-
"1767 MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKG... P20471 \n",
|
| 384 |
-
"4089 MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLW... A64556 \n",
|
| 385 |
-
"4623 MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGAT... AAF25576 "
|
| 386 |
-
]
|
| 387 |
-
},
|
| 388 |
-
"execution_count": 6,
|
| 389 |
-
"metadata": {},
|
| 390 |
-
"output_type": "execute_result"
|
| 391 |
-
}
|
| 392 |
-
],
|
| 393 |
-
"source": [
|
| 394 |
-
"cpu_ids = [\n",
|
| 395 |
-
" 'CAC14227',\n",
|
| 396 |
-
" 'P12255',\n",
|
| 397 |
-
" 'P20471',\n",
|
| 398 |
-
" 'A64556',\n",
|
| 399 |
-
" 'AAF25576',\n",
|
| 400 |
-
" 'Q4L9P0',\n",
|
| 401 |
-
" 'Q9I5N6',\n",
|
| 402 |
-
" 'Q9I791',\n",
|
| 403 |
-
" 'Q9I120'\n",
|
| 404 |
-
"]\n",
|
| 405 |
-
"\n",
|
| 406 |
-
"cpu_sequences = sequences_df[sequences_df['id'].isin(cpu_ids)]\n",
|
| 407 |
-
"cpu_sequences.head()\n"
|
| 408 |
-
]
|
| 409 |
-
},
|
| 410 |
-
{
|
| 411 |
-
"cell_type": "code",
|
| 412 |
-
"execution_count": 7,
|
| 413 |
-
"id": "a919beeb",
|
| 414 |
-
"metadata": {},
|
| 415 |
-
"outputs": [
|
| 416 |
-
{
|
| 417 |
-
"name": "stdout",
|
| 418 |
-
"output_type": "stream",
|
| 419 |
-
"text": [
|
| 420 |
-
"['MNTIFKVIWNASLNVWVVVSELAKGRIKTKSSRNLISEGVLPKFEQSMVSKLFRKNLLALSLGSIVFLSTGPVFAADITVSTQAELSAALSNGTYDKIILGADITLIGSLTVNMTSNQVVIDGQGKFGLTVNNTTNYGLVVSSGSGTLTLQNMSKIDSANYYSMVVLNGANTAVNVIYNNIDFLGSSQLIYMGAYGAATNSIMTFGDILNDVVVNDRAQEIGEVNKLAFTGRFHVTHTGSSVTSFVSTGGANNTSTMDFASGADVKIDRTGSTGDLTSTGVNAFAYTFADGASFELIANQNVFSGTTTNRGLEIGSYNSIDGFGSGVKIVLQSRSDGSIISGNGIDNATTNAAGINNNASGDANVIYNLGTGSILKATNTGILATKNANNASDIYIRSAGDITAATGISATHNGTGTVKIKNDGTITSTTAGIAISSASIKEISVDNTDGTITATAGTGVNVLASAILNLFGGTINTSATANGITFAGTEGGHTLTDLTINLLGTGIALSNVAGVNLTLSNVTLNTLNGTALNSLTGLTLVDSLNGRNTINIEGAGIGIAATNTELNTFDAEALDINVNGAGIGIQATGGGVNLSASNLIINVANTLGTALQITDGIDNTTTIGNEIQLNAENATAINFLGSSSKTLNNNGTIKGSVIFAGVADHIINNNGTLDGTLTTGAGNDTLVLDSSSQSNDVINLGDGNNSVTIQNGATVSSIITGNGNDTFTINGMSVGSTYLGSLDAGTGLNTXNXXASTDELAAATSLQGFTNINLVDSHITLVSDDNIGSGMVNIDSSSELLFGSTFDGILHATLGAGTGSAIVNNSANVSLEQASMFAGTWQVNQGGALTASNSNQLGSAKIGLDGTLNLDNIALFNHVLTGNGTLNVAKNLATTAFDFGSTVGGAFSGIVNLTKTTFALSADNAAALASATLKLSDDSVTTVGTTDRTLHGLDLSGGTLIFDGAVPQSQTSGVVTVTDLALNSGTVNITGSGSWDNTDPLATNVSILEQDRAGSTLELINATNVTGDIDALDLLVNGTAITSGTQGVQSAIQQGGSTVANAIHNYGLASSNSNGDSGLYVNYTLSALELLADGADALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGSNRYEGTTTVTAGELILGANGAFGQTSLLDIASGASANINGYSQTVGAVTNVGTVTLGSGGVLTSGLLTNGGILDLTGGALNLTXGGASTVAGGLTGAGTLNINGGNLSVSAANSGLSGQTHIADVASVTLTDTGTLGTSAVEVLGTLNLNGANAAMTNVLSGDGTINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHVLTGNGLLNVAKNDASTAFDFGSTVGGAFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGATDRTLHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLELINAANVTGNANNLDLLVDGTAITSGTQGVESAIQQGGSTVANAIHNYGLTSSNGNGGSGLYVNYTLSALELLANGANALLLATESGLTANRVLNAELFGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNSGAVTLGNGGVLTSGLLTNGGILDLTGGALNLAAGGSSTVAGGLTGAGTLNINGGDLAVSATNSGLSGQTHIADVASVTLTGTGTLGTSAVEVLGTLNLNGANAAMTNVLSGGGVINTNAAVTLSGNNSFSGAHQIGTDGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSG
FLGQYALAGNSKLTVASTNNLGASSSVALAGAGDTLSLSGFNXTFGNSVTGSGVLQVTDDAEVTLTSSNXVGNTVKVDIADATLYVNDIALLDHVLTENGTLNVAKYLATTAFDXGSTVGXXFSGIVNLTNTTFALSADNAAALARATLKLSDDSVTTVGTTDRILHGLDLNGGTLIFDGSPPQSQANGVVTVTDLALNSGTISITGAGNWENEHPVTPPNVSLLEQDRGDILLQLIDADNVTGNANDLELMINGTTISAGQGVQSTVQQGGYTVANATHNYGMTSNGGSGLYVNYTLSALELLADGANALLLATESGLTANRELNAELSGVGGLVVDAQNGALTLANGNNRYEGTTTVTAGELILGANGAFGQTSLLNIASGASANINGYRQTVGAVTNTGTVTLGNGGELTSTDTLINTGMINVTDGILNLENGGASSISGGLTGNGILNIKGGDFTISIDNNGLAGQTNISDGASVTLGNGGTIIGTGNLGSSVIDVLGDLNLVADNSLANVISGDGTINTTATVTLSGNSSFSGAHQIGTNGELTVGQASNLGASSATVNLGTLTSHLILNGVSESIANVLSGVAGSTVDIIGGADTALTANNSGFLGQYALAGNSKLTVASTNNLGASSSVALAGTGDTLSLSGFNGTFGNSVTGSGVLQVTDDAEVTLTSSNGVSNAVTIDIADATLNLDDIALFNHALTGNGLLNVAKNDASTAFDFGATVGGAFTGTVNLNNSTFDLSGNNTTVLAQATLKLSSGNLTSVGNGVQNIGTLAMNGGTLLFDNIVDNAGIITSDGTIAANSINTTGGGEVRVNLPSNLAPSLDGLSVMELDEGEIIVTLATGAATGTGHELTLTDENGDPISAVTYQGVHNAGSTSAAATGSFNYGMTTGEDYDGLYVNYGLTALELLSTGSEALVLTAILANNGTQSNDLSAQITGSGDLAFASANDGSTASLSNSTNSYTGTTWVSSGNLRLDADSALGQTSLLAMSTATHVDINGTQQVVGELATEGGSTLDLNDGKLTVTGGGQIDGALTGGGELVLSGGLLNVSYDNAGFTGSTDIANGAVAHLSQAQGLGNGTINNNGTLHLDNTIGTLFNALTGSDGEVLLSNNASVQLAGDNSGYSGLFTNQAGSILIANSAEHLGGSSIANSGALILDTGSVWELTNTISGTGTLVKRGSGTVKIEGDTVSAGLTTIEEGLLQLGSSAVTQTLSLEESLQERALLVSFASNMANLTSNVLITANGSLGGYGQVTGNVENYGNLIMPNALTGGDFGTFTIDGNYTGDEGMITFNTILAGDTSVTDRLVITGDTAGQSYVTVNNIGGVGARTFEGIKIIDVGGDSAGQFTLNGRAVGGAYEYFLYQGGASTPDDGNWYLRTEADDRRPEPASYTANLAAANNMFVTSLADRMGETLYTDVFTGEQKTTSLWLRNEGSHNRSRDDSGELKTQDNRYVMQLGGDVAQWSRNAQDLWRVGVMAGYANSSSSTVAQVAGYRSTGSVDGYSVGIYGSWLADNADDTGAYVDSWVQYSWFDNRVSGQDLATEKYDSKGFTASVEGGYAFKVGESVNQSYFIQPKAQVVWMGVKADDHTETNGTVISGDGNGNIQTRLGAKAFINPSDKAKVSGPAFKPFVEANWIHNTKDFGTTLDGVTVKQAGTANIAELKLGVDGQVNSQLNLWGNIGQQVGNKGYSETSVVLGVKYNF', 
'MNTNLYRLVFSHVRGMLVPVSEHCTVGNTFCGRTRGQARSGARATSLSVAPNALAWALMLACTGLPLVTHAQGLVPQGQTQVLQGGNKVPVVNIADPNSGGVSHNKFQQFNVANPGVVFNNGLTDGVSRIGGALTKNPNLTRQASAILAEVTDTSPSRLAGTLEVYGKGADLIIANPNGISVNGLSTLNASNLTLTTGRPSVNGGRIGLDVQQGTVTIERGGVNATGLGYFDVVARLVKLQGAVSSKQGKPLADIAVVAGANRYDHATRRATPIAAGARGAAAGAYAIDGTAAGAMYGKHITLVSSDSGLGVRQLGSLSSPSAITVSSQGEIALGDATVQRGPLSLKGAGVVSAGKLASGGGAVNVAGGGAVKIASASSVGNLAVQGGGKVQATLLNAGGTLLVSGRQAVQLGAASSRQALSVNAGGALKADKLSATRRVDVDGKQAVALGSASSNALSVRAGGALKAGKLSATGRLDVDGKQAVTLGSVASDGALSVSAGGNLRAKQLVSSAQLEVRGQREVALDDASSARGMTVVAAGALAARNLQSKGAIGVQGGEAVSVANANSDAELRVRGRGQVDLHDLSAARGADISGEGRVNIGRARSDSDVKVSAHGALSIDSMTALGAIGVQAGGSVSAKDMRSRGAVTVSGGGAVNLGDVQSDGQVRATSAGAMTVRDVAAAADLALQAGDALQAGFLKSAGAMTVNGRDAVRLDGAHAGGQLRVSSDGQAALGSLAAKGELTVSAARAATVAELKSLDNISVTGGERVSVQSVNSASRVAISAHGALDVGKVSAKSGIGLEGWGAVGADSLGSDGAISVSGRDAVRVDQARSLADISLGAEGGATLGAVEAAGSIDVRGGSTVAANSLHANRDVRVSGKDAVRVTAATSGGGLHVSSGRQLDLGAVQARGALALDGGAGVALQSAKASGTLHVQGGEHLDLGTLAAVGAVDVNGTGDVRVAKLVSDAGADLQAGRSMTLGIVDTTGDLQARAQQKLELGSVKSDGGLQAAAGGALSLAAAEVAGALELSGQGVTVDRASASRARIDSTGSVGIGALKAGAVEAASPRRARRALRQDFFTPGSVVVRAQGNVTVGRGDPHQGVLAQGDIIMDAKGGTLLLRNDALTENGTVTISADSAVLEHSTIESKISQSVLAAKGDKGKPAVSVKVAKKLFLNGTLRAVNDNNETMSGRQIDVVDGRPQITDAVTGEARKDESVVSDAALVADGGPIVVEAGELVSHAGGIGNGRNKENGASVTVRTTGNLVNKGYISAGKQGVLEVGGALTNEFLVGSDGTQRIEAQRIENRGTFQSQAPAGTAGALVVKAAEAIVHDGVMATKGEMQIAGKGGGSPTVTAGAKATTSANKLSVDVASWDNAGSLDIKKGGAQVTVAGRYAEHGEVSIQGDYTVSADAIALAAQVTQRGGAANLTSRHDTRFSNKIRLMGPLQVNAGGAVSNTGNLKVREGVTVTAASFDNETGAEVMAKSATLTTSGAARNAGKMQVKEAATIVAASVSNPGTFTAGKDITVTSRGGFDNEGKMESNKDIVIKTEQFSNGRVLDAKHDLTVTASGQADNRGSLKAGHDFTVQAQRIDNSGTMAAGHDATLKAPHLRNTGQVVAGHDIHIINSAKLENTGRVDARNDIALDVADFTNTGSLYAEHDATLTLAQGTQRDLVVDQDHILPVAEGTLRVKAKSLTTEIETGNPGSLIAEVQENIDNKQAIVVGKDLTLSSAHGNVANEANALLWAAGELTVKAQNITNKRAALIEAGGNARLTAAVALLNKLGRIRAGEDMHLDAPRIENTAKLSGEVQRKGVQDVGGGEHGRWSGIGYVNYWLRAGNGKKAGTIAAPWYGGDLTAEQSLIEVGKDLYLNAGARKDEHRHLLNEGVIQAGGHGHIGGDVDNRSVVRTVSAMEYFKTPLPVSLTALDNRAGLSPATWNFQSTYELLDYLLDQNRYEYIWGLYPTYTEWSVNTLKNLDLGYQAKPAPTAPPMPKAPELD
LRGHTLESAEGRKIFGEYKKLQGEYEKAKMAVQAVEAYGEATRRVHDQLGQRYGKALGGMDAETKEVDGIIQEFAADLRTVYAKQADQATIDAETDKVAQRYKSQIDAVRLQAIQPGRVTLAKALSAALGADWRALGHSQLMQRWKDFKAGKRGAEIAFYPKEQTVLAAGAGLTLSNGAIHNGENAAQNRGRPEGLKIGAHSATSVSGSFDALRDVGLEKRLDIDDALAAVLVNPHIFTRIGAAQTSLADGAAGPALARQARQAPETDGMVDARGLGSADALASLASLDAAQGLEVSGRRNAQVADAGLAGPSAVAAPAVGAADVGVEPVTGDQVDQPVVAVGLEQPVATVRVAPPAVALPRPLFETRIKFIDQSKFYGSRYFFEQIGYKPDRAARVAGDNYFDTTLVREQVRRALGGYESRLPVRGVALVAKLMDSAGTVGKALGLKVGVAPTAQQLKQADRDFVWYVDTVIDGQKVLAPRLYLTEATRQGITDQYAGGGALIASGGDVTVNTDGHDVSSVNGLIQGRSVKVDAGKGKVVVADSKGAGGGIEADDEVDVSGRDIGIEGGKLRGKDVRLKADTVKVATSMRYDDKGRLAARGDGALDAQGGQLHIEAKRLETAGATLKGGKVKLDVDDVKLGGVYEAGSSYENKSSTPLGSLFAILSSTTETNQSAHANHYGTRIEAGTLEGKMQNLEIEGGSVDAAHTDLSVARDARFKAAADFAHAEHEKDVRQLSLGAKVGAGGYEAGFSLGSESGLEAHAGRGMTAGAEVKVGYRASHEQSSETEKSYRNANLNFGGGSVEAGNVLDIGGADINRNRYGGAAKGNAGTEEALRMRAKKVESTKYVSEQTSQSSGWSVEVASTASARSSLLTAATRLGDSVAQNVEDGREIRGELMAAQVAAEATQLVTADTAAVALSAGISADFDSSHSRSTSQNTQYLGGNLSIEATEGDATLVGAKFGGGDQVSLKAAKSVNLMAAESTFESYSESHNFHASADANLGANAVQGAVGLGLTAGMGTSHQITNETGKTYAGTSVDAANVSIDAGKDLNLSGSRVRGKHVVLDVEGDINATSKQDERNYNSSGGGWDASAGVAIQNRTLVAPVGSAGFNFNTEHDNSRLTNDGAAGVVASDGLTGHVKGDANLTGATIADLSGKGNLKVDGAVNAQNLKDYRDKDGGSGGLNVGISSTTLAPTVGVAFGRVAGEDYQAEQRATIDVGQTKDPARLQVGGGVKGTLNQDAAQATVVQRNKHWAGGGSEFSVAGKSLKKKNQVRPVETPTPDVVDGPPSRPTTPPASPQPIRATVEVSSPPPVSVATVEVVPRPKVETAQPLPPRPVAAQVVPVTPPKVEVAKVEVVPRPKVETAQPLPPRPVVAEKVTTPAVQPQLAKVETVQPVKPETTKPLPKPLPVAKVTKAPPPVVETAQPLPPVKPQKATPGPVAEVGKATVTTVQVQSAPPKPAPVAKQPAPAPKPKPKPKPKAERPKPGKTTPLSGRHVVQQQVQVLQRQASDINNTKSLPGGKLPKPVTVKLTDENGKPQTYTINRREDLMKLNGKVLSTKTTLGLEQTFRLRVEDIGGKNYRVFYETNK', 
'MLQNTTQSNLPREPEAKQIDYNDSIRSTYFSIDDLRACGASLAEKGTSALPGFFPFEFRARHRENEKEILRVYRATAADVEAGASITPAAEWLLDNHHVVEEAIQEVRRDFPRRFYRQLPTLSVSGTVIPRTMALAWLYVAHTHSTVTRESITAMVEGFQEHETLKIGELWALPSILRFVLIENLRRIAIRVERSRGMRRKANEVADQLIRLNDPEGCRTLLVESEALAADNTFIAQLLYRMRDGSQSSGAVIAWIEERLERRGTDVEEALVAEQNRLSSGNATMSNIIRSLREIDDTDWAVWFESVSKIDATLREGSDYAALDFGSRNTYRDTIEKLARRSGHSEHEVTEIAIEMVEEAKAAAAVEAPLQEPNVGSFLVGKQRLALEKRIGYSPSIFQHLIRSVRKLDWFAIAGPNILLTILAMIVVYAFVSPMDIPSGAKLIMLLLFALPASEGAMGLFNTVFTLFAKPSRLVGYEFLDGIPEDARTLVVVPCLIAKRDHVDELVRNLEVHYLANPRGEIYFALLSDWADSKSEEAPADTDVLEYAKREIASLSARYAYDGKTRFFLLHRRRLYNEAEGVWMGWERKRGKLHELNLLLRGDRDTSFLQGANMVPEGVQYVMTLDSDTRLMRDAVTKLVGKLYHPINRPVVNPRTQEVVTGYSLLQPRVTPSLTTGSEASAFQRIFTINRGIDPYVFTVSDVYQDIAGEGSFTGKGLYHVDAFEAALKSRIEENAVLSHDLLEGSYARCALVTDIELVEDFPIRYEVEMSRQHRWARGDWQLLPYIFNPKNGLSMLGRWKMYDNLRRSLIPVAWLAASVMGWYYMEPTPALIWQLVLIFSLFVAPTLSLISGIMPRRNDIVARAHLHTVLSDIRAANAQVALRIVFIAHNAAMMADAIVRSLYRTFVSRKLMLEWRTAAQVQSAGHGSIGDYFRAMWTAPALALVSLALAAISDTGLPFIGLPFALIWAASPAVAWFVSQSAETEDQLVVSEEAIEEMRKIARRTWRYFEAFVTAEQNFLPPDNFQETPQPVLAERTSPTNIGVYLLSVMSARSFGWIGFEETITRLEQTIATIDRMPKYRGHLFNWYRTRGLEPMEPRYVSSVDSGNLAGHLIAVSSMCREWAEAPSAHVQGNLDGIGDVAAILKEALNELPDDRKTVRPLRRLVEERIAGFQNALAAVKRERELASIRVINLAVLARDMHKLTVNLDHEVRTVQSGEVATWAGSLVAACEAHIADGVFDLGAIEALRQRLLVLKERARDIAFSMDFSFLFRPERRLLSIGYRVNANELDEACYDLLASEARLTSLFAIAKGDLPTEHWYKLGRPIVPIGARGALVSWSGSMFEYLMPPLVMQERQGGILNQTNNLVVQEQINHGRRLGTPWGISEAAFNARDHELTYQYTNFGVPTLGLKRGLGQNAVIAPYASILACMYDPKSALANLARLREVGALGAYGYHDAVDFTPTRVPEGQKCAVVRNYYAHHHGMSVAAVANVVFNGQLREWFHADPVIEAAELLLQEKAPRDIPVMAAKREPEALGKGQADLLRPEVRVVEDPINQDRETVLLSNGHYSVMLTATGAGYARWNGQSVTRWTPDPVEDRTGTFIFLRDTVTGDWWSATAEPRRAPGEKTVTRFGDDKAEFVKTVGDLTSEVECIVATEHDAEGRRVILLNTGTEDRFIEVTSYAEPVLAMDDADSSHPTFSKMFLRTEISRHGDVIWVSRNKRSPGDPDIEVAHLVTDNAGSERHTQAETDRRRFLGQGRTLAEAAAFDPGATLSGTDGFTLDPIVSLRRVVRVPAGKKVSVIFWTIAAPDREGVDRAIDRYRHPETFNHELIHAWTRSQVQMRHVGITSKEAASFQMLGRYLVYPDMHLRADAETVKTGLASQSALWPLAISGDFPIFCLRINDDGDLGIAREALRAQEYLRARGITADLVVVNERASSYAQDLQHTLDSMCENLRLRGLSDGPRQHIFAVRRDLMEPETWSTLISASRAVFHARNG
TISDQIARATSLYSKSSEKKEEGAEMLLPVIREADARTAVELDGGDLDFWNGFGGFAEDGREYAVRLRGGEATPQPWINVISNEQFGFHVSAEGAAFSWSRNSRDYQLTPWTNDAVVNRPGEAIFVRDMASGAVLTPYAALSRRKSALFETRHGLGYSRFLSTQDELEIEAMHTVHRTLPAKLVRLTIRNRSSAARKLRVYGYAEWVLGNNRSRTAPFVLSEWDESAKTLVATNPYSIDYPGRCAFFASDGDIAGYTASRREFLGRAGGILAPQAVISGAELTGSTDVDGDACAALATDITVEAGVERQVTFFLGDADNPDQVRAVLEELRADSFGAALEAAKAFWGDFTGVVKVETPDRAFNHMINHWLPYQALGCRIMARSAFYQASGAFGFRDQLQDTLAFLIHRPALARAQILNAAARQFVEGDVQHWWLPGTDAGVRTMISDDVVWLAHAVAHYCAVTGEEDILKEKVPFITGPALEEGQHDSFYKPDVADEVGDVYEHCARALDLAIHRTGANGLPLILGGDWNDGMNRVGEAGEGTSVWLGWFLAGTLRAFLPYARARKDKPRVALWERHLEALKDALEQAGWDGDYYRRGYYDDDTPLGSAENGECRIDSIAQSWSTLSGEGDKERSLRAMDAVMAELVDPEKRIVRLFTPPLETTKQDPGYIKAYPPGVRENGGQYTHAATWVVLAFAAQERAEEAWRTFRMLNPVSHALSQVDAEHYRVEPYVVAADIYGEGALAGRGGWTWYTGSAGWLYRAGVEGILGIRKRGDKLLIRPVLPSEWPGYSAEVRVNGTTHRISVSRDSKSGEPVVSVNNSVTKNAHEGVLL', 'MKKFKKKPKSIKRSHQNQKTILKRPLWLMPLLISGFASGVYANNLWDLLNPKVGGEYVHWVKGSQYCAWWEFAGCLKNVWGANHKGYDAGNAANYLSSQNYQAISVGSGNETGTYSLSGFTNYVGGNLTINLGNSVVLDLSGSNSFTSYQGYNQGKDDVTFTVGAINLNGTLEVGNRVGSGAGTHTGTATLNLNANKVNINSNINAYKTSQVNIGNANSVITIGSVSLSGDVCSSLASVGIGANCSTSGPSYSFKGTTNATNTAFSNASGSFTFEENATFSGAKWNGGTYTFNKEFSATNNTAFSSGSFNFKGVSSFNGTSFSNASYTFDNQATFQNSSFNGGTFTFNNQTNPTNNAQHPQIQNSSFSGNATTLKGFVNFQQAFNNSNHQLTIQNASFNNATFNNTGKITIEKDASFNNTTFNTSVDTNNMSVTGGVTLSGKNDLKNGSTLDFGSSKITLAQGTTFNLTSLGSEKSVTILNSSGGITYSNLLNHAINGLTSALKTNESLSNPQSFAQGLWDIITYNGVTGQLLNENAATSKPTDSSPSKSSTNSTQVYQVGYKIGDTIYKLQETFSHNSIIIQALESGTYTPPPVINGSKFDLSASNYINADMPWYDHKYYIPKSQNFTESGTYYLPSVQIWGSYTNSFKQTFSANGSNLVIGYNSTWTDHNVSSSGTVSFGDTSGSALNGHCGPWPYYQCTGTTNGTYSAYHVYITANLRSGNRIGTGGAANLIFNGVDSINIANATITQHNAGIYSSSMTFSTQSMDNSQNLNGLNSNGKLSVYGTTFTNEAKDGKFIFNAGQAVFENTNFNGGSYQFSGDSLNFSNNNQFNSGSFEISAKNASFNNANFNNSASFNFNNSNATTSFVGDFTNANSNLQIAGNAVFGNSTNGSQNTANFNNTGSVNISGNATFDNVVFNGPTNTSVKGQVTLNNITLKNLNAPLSFGDGTITFNAHSVINIAESITNGNPITLVSSSKEIEYNNAFSKNLWQLINYQGHGASSEKLVSSAGNGVYDVVYSFNNQTYNFQEVFSQNSISIRRLGVNMVFDYVDMEKSDHLYYQNALGFMTYMPNSYNNNLGNANNTIYYYDKSIDFYASGKTLFTKAEFSQTFTGQNSAIVFGAKSIWTSLSDAPQSNTIIRFGDNKGAGSNDASGHCWNLQ
CIGFITGHYEAQKIYITGSIESGNRISSGGGASLNFNGLQGILLTNATLYNRAAGTQSSSMNFISNSANIQAQNSYFIDDTAQNGGNPNFSFNALNLDFSNSSFRGYVGKTQSVFKFNAKNAISFTNSTNLSSGLYQMQAKSVLFDNSNLSVSVGTSSIKANAINLSQNASINASNHSTLELQGDLNVNDTSSLNLNQSTINVSNNATINDYASLIASNGSHLNFNGAVNFNSANITTSLNNSSIVFKGAVSLGGQFNLSNNSSLDFQGSSAITSNTAFNFYDNAFSQSPITFHQALDIKAPLSLGGNLLNPNNSSVLDLKNSQLVFGDQGSLNIANIDLLSDLNDNKNRVYNIIQADMNSNWYERISFFGMHINDGIYDAKNQTYSFTNPLNNALKITESFKDNQLSVTLSQIPGIKNTLYNIGSEIFNYQKVYNNANGVYSYSDDAQGVFYLTSNVKGYYNPNQSYQASGSNNTTKNNNLTSESSIISQTYNAQGNPISALHIYNKGYNFNNIKALGQMALKLYPEIKKVLGNDFSPSSLNALNSNALNQLTKLITPNDWKNINELIDNANNSVVQNFNNGTLIVGATQIGQTDTNSAVVFGGLGYQTPCDYTDIVCQKFRGTYLGQLLESSSADLGYIDTTFNAKEIYLTGTLGSGNAWGTGGSASVTFNSQTSLILNQANIVSSQTDGIFSMLGQEGINKVFNQAGLANILGEVAVQSINKAGGLGNLIVNTLGSNSVIGGYLTPEQKNQTLSQLLGQNNFDNLMNDSGLNTAIKDLIRQKLGFWTGLVGGLAGLGGIDLQNPEKLIGSMSINDLLSKKGLFNQITGFISANDIGQVISVMLQDIVKPSNALKNDVAALGKQMIGEFLGQDTLNSLESLLQNQQIKSVLDKVLAAKGLGPIYEQGLGDLIPNLGKKGLFAPYGLSQVWQKGDFSFNAQGNVFVQNSTFSNANGGTLSFNAGNSLIFAGNNHIAFTNHAGTLQLLSDQVSNINITTLNASNGLKINAANNNVSVSQGNLFVSASCAQQSDPTTANIANPCALSAQSTNGASSNNASNNAPIALSNNDESLMVAANDFNFSGNIYANGVVDFSKIKGSANIKNLYLYNNAQFQANNLTISNQAVLEKNASFVTNNLNIQGAFNNNATQKIEVLQNLVIASNASLSTGIYGLEVGGALNNSGAIHFNLENTQTPTPLIQAEGIINLNTTQTPFMNVNNSMANNTTYTLLKSSRYIDYNINPNSLQSYLNLYTLININGNHIEEKNGALTYLGQRVLLQDKGLLLSVALPNSNNASQNNILSLSVLYNQVKMSCGDKAMDFTPPTLQDYIVGIQGQSALNQIEAVGGNAIKWLSTLMMETKENPFFAPIYLKNHSLNEILGVTKDLQNTASLISNPNFRDNATNLLELASYTQQTSRLTKLSDFRSREGESDFSLLELKNKRFSDPNPEVFVKYSQLSKHPNNLWVQGVGGASFISGGNGTLYGLNAGYDRLVKNVILGGYVAYGYSDFNGNIMHSLGNNVDVGMYARAFLKRNEFTLSANETYGGNATSINSSNSLLSVLNQRYNYNTWTTSVNGNYGYDFMFKQKSVVLKPQVGLSYHFIGLSGMKGNDAAYKQFLMHSNPSNESVLTLNMGLESRKYFGKNSYYFVTARLGRDLLIKSKGSNTVRFVGENTLLYRKGEVFNTFASVITGGEMHLWRLVYVNAGVGLKMGLQYQDINITGNVGMRVAF', 
'MVGKNNNYVRESKSNEHFQRFALRKLSVGVVSVAVAAGFYLGSGATAQAATTESNASAKTEQVVQQNSTSAASDSTSTSNSSAAVSTSSATPVSTESASSMTVSDLPASASAASDNQASAANASESSSQSASSSVASDAAATVSKDSQAASEANSQSAADVETVQLPTSAANANANESQAANILGAQAVQKAANQQAPAGFTVTDPNYPAEMYKDPDASHYTYWWAQSSNGEYNLVLSTDRNGDGKVYVFLLGNNNNVLGKYTVDKNKSTEVATDDEGDFGTVYNDGQSGVFVTSDGTWKSKFNVFDPKAGEDDGDYGSISFMIPQVETQTTTYVTYFDSKGNKVDKPIEVSDPVIQKGLDGQIYTTKGGKVINGYFAKEPKNAHGFMSPFGKQGAIYTKDWHDGLKATFTETDTKTGLMHVVVKHYYHSWGWGTWRTVKEFDLAPGQSEKVDYDVYKSVTIHSIYIPQTINIQYTYEKLGNLVISSDSKSFPAEDKTQYPNDKSDSTKAGNVTIPKVAGFTPTINDKTVTNYTFNPSDYVSDLSKDINVVYVADTQEAAISFYDETDHKPLNDQTIQLTGKTGEKISHTEANQTLAKLGKQGYVVDQNTFADDATYDNDTQAPQEFTIYLKHDTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLADLVKQGYVLDKENTAKAFPADAVYDNNDQTPQEFTIYLKHGTTHTDATSSKADQKTVSETIHYVYKDGVNANKPVADDANTTVTFKRGYTTDKVTGKIVSYDPWTVDGKQADSKTFDAVKSPVIAGYTADQAEVAAQTVTPDSQNINKTVYYTADTQEAAINFYDETGHKLLDNQTIHLTGKTGEKVDRTQADQTLAELEKQGYVLDENNTKLGFPSNAAYDDDDVKPQEFTIYLKHGMTHTDATDKNAEQKIVTETIHYVYENNQTAKTDYTSAVDFKRGYTTDNVTHKIISYDPWMVSSKKFGFVKSPAIEGYTPNHSQIDEITVTPDSKDVVKTVVYVGNAQEAQAIFYDETTGKEISGTREIATGKTDETISFTKDPNEVVKELEKQGYVFDKDNAKNNVFVAGTAYDKNSEVHQYFKYYLKHGHATVTPDQDPQKGQKTVTQTIKYEYADGTATGLADNVQTLTFKRTGDKDLVTHEVTWPDWSTVAGQQTSVVTSPALKGYTADTNEIPAITYHAGDSDVTYVVKYNADVQHAVINYIDGESDEILHTDKVNGHSDEKINYSTADMIKQLEAKGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTR
NGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDTNDQFYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYANGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYAPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKNDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQKVHVQYIDGETDQMLRQDDLDGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKFEGDGYELFKDNFPAGEKFDNDDKTDQTYTVIFKHHRENVDPNHSSADGTKGTKTLTETVHYKYADGTKAAEDQTAQVTFTRNGVLDDVTGIVAWGKWNEASQSYKALTSPTIAGYTPSEAVVKRSSNSDAEQGPTLTVIYTADAQTAYVKYVDDTTGETLRQDDLHGYTDETIPYSTAEGIKKYEGDGYVLVSDGFKPGTKFGVGTPTYEVHFKHGMTHTDATDKNAEQKTVTETIHYVDENNQTVQPDSTTAVTFKRGYTTDNVTGKVVSYDPWTVDGNQADSKTFAAVPSPAVEGYTPNHQQINEFTVTPDSKDIVKTVVYVGDPQEAQAIFYDETTGKEISNTREIVNGKTDETIGFTKDPNEVVKELEKQGYVFDKDNANNNVFAAGTTYDKNSEVHQYFKYYFTHATTIVTPDNPKTPADVLPDNPGKNYPSGVAKDDLNKTVTRTINITTPDGKTQTITQKAEFTRSATVDEVTGEVTYGPWSKNVVLESVDVPNISGYVPSASVPEITVTPNDQDMTINITYKKLDSGKAADQGGNASNGGQATNGGSTTGQSAQNGQSGQTQNNAGAQQLPQTGNANNEKGALGLASAMFAAGLGLGFGSKKKCHED', 
'MSRKERNFKRFFGQEKARVKLYKSGKQWVKAGIREVQLLKVLGLPFLNKDVEQINNLDTNKDKNFKNQAMKATGLAGGAFTFAMLNDHHAYAASETPMTSEIASNSETVANQNSTTVTKSETSTTEYISSQTSTSQDATSSTNSTEKSTSSSTTDSQTSTDSTSDKSTSNSEKQDSSMSNSDTKASSSSTTDNSTSNNSTTSEKDTNSQANTTSTDSQKGSTSTNDNSITSTSTKDNQIRKNSTESNSITASNSTSDSNSGSTVSTNSTTSQLTSTSESQINTDLGSTLLVSDSTSTSTSTAPLKLRTFSRLATTTFAAAAATSTTNTYTGAGTDTNYNIPIYYKLTTVNNGTSMTFTYTVTYDNPATTTVERPTALSNSYAIYNTGTTNQTMFTLGSAYGTPSTATSYITDSTGAQVSNPRANTTNINKQGSGYTWANGYQMNGAQAKQGYGLTTTWTVPINSSGDTSFTFNPYSTSVTGGTNFFNGKKVTVTDPTSTANSQSASTSTANSQSASTSKSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSASTSTSTSTANSQSTSTSTSTSTANSQSTSTSTSTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSASTSTSESDSTSASTSLSESTSTSLSDSLSASTSLSDSASTSVSDSTSASTSLSGSESASLSDSASASTSLSESTSTSESTSTSESDSTSASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSGSESASLSDSASASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSLSDSASASTSLSESTSTSVSDSTSTSTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSDSASTSTSVSDSTSASTSDSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSLSESTSTSVSDSTSASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSASTSTSVSDSTSASTSLSGSTSTSESDSTSMSTSLSGSESTSLSDSLSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSTSISTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISRSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSV
SDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSMSTSLSGSESTSLSDSLSASTSVSASTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSVSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTNTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSNSASASTSLSGSASASLSDSLSASTSVSASTSTSVSDSTSTSTSLSESTSTSLSDSASASTSLSDSASTSVSDSTSASTSLSESTSTSVSDSTSTSTSLSGSESTSLSDSASASTSLSASTSTSVSDSTSTSTSDSVSTSTSMSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSVSTSLSASTSTSESDSTSTSTSLSGSTSTSVSDSISGSTSLSGSTSTSVSDSTSTSTSDSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSASTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSGSTSLSASTSTSVSDSTSTSTSLSASTSTSVSDSTSMSTSLSGSTSTSVSDSTSASTSLSGSTSTSVSDSTSTSTSLSASTSTSVSDSTSTSTSLSGSTSTSVSDSTSASTSSSESTSTSVSDSTSASVSTSISTSISMSESSSTSASTSDSTSTSASTSESRSASHSMSGTDSNNTSSSDSKSHSISNSDSNTTSDSASASTSISDSSSTSTSDSNASHSFSTSHSVSESNSMSTSHSQFDSISTSESMSGTDSTSLSTSLSHSASTSNSTSMTTSESQSNNDSQMHSNSLHHDAKDELPDTGDSDSNSTGLVSAVAAMLAGLGLFGKSRKNKKDKKNKGSEQ', 
'MPSRSPSSARSSRALYAPRLKPLAQAIALLLVAGGAQAAGQPFSAAWFAAKGAAQGGAAGAPRPGAQLPGAPPPLAQQQRVNQQLQRSMANLNNTVAAIAAQQAAQAAGRQAALNLPQDVPDGLGEGGLKVDASLPFEQAWQNAKGPVQTQAAGKTTVSIEQTADKAVLNWETFNVGRNTTVDFQQHADWALLNRVNDPSARPSQIQGQIKADGTVMLVNRNGVVFSGSSQVDVRNLTVAAANISDEQFRQRGLYYDNAGSRPTFTDAAGAVRVEQGAQLRTAAPSGSTRGGGYVLLLGSEVDNAGSIVTPKGQTVLSAGDSFVIRRGQGTDGNLTSTTRGNEVLPGFAADSSAGRVRNSGLVQAATGDISLSGREVEQAGVLLSSSSVDSRGTLHLKASERITLAEGATSAILVDSSGSAALDSQREALLKPLNGSSAAVSRGDDDRRDLSRVEIDSAGSVDFRDGSITLASGGQVAVNAGQRALLRDGAVIDVSGAQGVQVAMETNSIKVNVRGNEQRDASVNREGGGLNSQDVWVDVRDLVRVPAGTNGYASDRWYTAGGLLEVGGYLGTQGHSAGEWMAQGGIVSFTGNDVVTQAGSQINLSGGTLDVQGGYIRQTWLKGSDGRLYELGSAPGDLLYDGIYRGYEAHSERWDQTRYFYNPLIAPTQRYENGYSVGRDAGSLVIGSANARLDGQVVGDTYRGERQTEAPQAGLDGYNQSQNAVARGAQLVVGRYTPYYVKSSGLLEYALGADAGSLKQVVIGTGEVAAEEPTLDAPVAAERQGRLSLDSELLNGFQLGGLKVAAGESIRVDSALTLANGGEAILFANDVAIDADITAHGGSLQAGNVLAQISPNGTIDGFVDAGREAGILRVGDGVRLDASGLWSNLLLAPEDNDTLAYRDGGRISLRSGGDLSLGQGSLLDVSSGAALLADGKRLGGRGGDIALHASAGLAQASDGQLQLGGTLNGLGTSGAGTLSLQSGKVRIGGDDLGDGSLQLAEDFFQQGFASYRVVGRSGLTVAEDAQVRVARPVYRFASGAGEVAAGEAPREALEAWIPPLYLEDALAGRLVQREGADLYLQAGGDGNILGQLDPASQTLELGRGSLVEVDPGRAIVLRGPGQITLDGILNAWGGRIDVRQQQFGALDVTQDNQPKAQGQPHARSIWIGEQALLDVAGRAVTALDGRGRRYGEVQSGGSIVIGGEIDPGKAIATSADAFVIVRPGARLEASGSQAQLDVPGLGRVLLAGDGGRIALSSYNGLYLDGSLRAAAGGSGAAGGSLEIIADAPLYQGFTVVDDRVLAMRELILTAGHADSGLPTLLQPGMDDSALRYGQSRVGTQSLTGGGFDQLSLFSNGPLSFEGNIDLAMGRSLNLYAGTIAATGGGPSEVKLQAPYVRLSGIGMYGQQASGEFRPRLTYGPTATAEQVRLQVSAGRLLDIAGRLSFGSDGVINGVNAEAVRYQRPGFEKVTLRSEGDLRFAGDYPENGDPSGRLITHGDLQLTAAQLYPVTGASSTLYAGYGLDEGGQAVFDAERHLAIERSGESLPDTPLSVFGSLAFMASNIEQGGVVRAPLGLIQFGSNLDRAPGTVRLLPGSLTSVSGAELVMPYGGTTDGINYLVNQVPIQLTGAGGALAAGTLVAGVGLYASEVDVQQGARLDLSGGGELAGAGFISGRGGSTDARFHPLVQQDNDGFRLPELSSNPVYAIVPGHQAVSAPLGGEAGAIQPLVGQQVTIGDGVPGLAAGTYTLLPSTYALLPGAFRVEINGLAGQGAPMATQGLRNGSWATSGQLSIAGTSIRDSLSRQVILSSADTLRRYSQYNEMSYADFIRADAARKNIPRAMLPVDARSLYLGLRADEELRENALSFEGKVDFTPEESGYGGSLIVDAEAGIEILPEGGLPDSDFAGVSLVADDLNAIGASRIAIGTLPYVEYGEQGNFVQFGGSNRLFPVVLRKGAHLSAPEVIIGRDITLEGGSGISTLGKGKTAYDSSDGFIYQPG
GRNLLLLSNGWLNLLAPAADSSLPVRLGGCAEGAGCADTELYSEGTLGIATNGTVTFGDNVRYGTRNLSLALSTINIGSSQSLADAAARGVLPNGLALDQTVLQRLLRGERGAGIPALENLILSARDAVNIYGSVSLDTYDPATGKSSLANLVLGTPAIYGHGTGEDVASIRTASLVWSGSSQPAAAPVAGGAGSGSGTLRVDAERITLGYGANTQPAGETDEARLALGFAEVQLNASERISANHKGSLRVYQRLDGYVAGEGLRYSGGDLRLSTPLLTGEAGSLSRISSGGSLSLAAPAGAAAVTFDSGTAGLGAELSLSAREIRLDSAVSLPSGKLSLSAEDDLELGDGARIDLAGRKASFNDVDKYSWGGDLLLSSRAGDIRQAAGSLIDLSARNNRGGTLSAVALAEDAGVVDLQGRILGGASGDYDAGGTRVPFLGGELEIRAQRLGDGGSLSEQFTALNQRLNQGEVFGARRFQLKQGDLQIGDGLKAHRIEVSLDNGQLGVSGTVDASGAQVGEIRLAGGRGLSLGGNALLDAHGSLLRRDSYGQIIDSPNRAMVELSSGSGTLVLAGGARIDLRHGTAAPAEQVDGVARGTLELNAPRLGGVSAGDIAIDASGALDIRGAGSIALNAMQRYDDAPWGNDPAAGGRSYQVIDQAYLDARHAESSAFIAAALANRELLDGKLAGLTNATYADAFHLRPGVEIVSATADGDLVVQGDLDLSGYRYASLNPNTPLTEVYGSGEVGALVLRAGGDLNLYGSINDGFAPPPDSPDDKGWILTPGVQPFGGDLVVPGPGVVLGDGTAFLGGRTLNYDLPIKGTTLAAGTRLATEAVLEQPYTLAAGSVLVADIHDAAGTLLYAAGSLLRDGVTLEVGSRLGAGTLLAAPASVQAMTWPAGVPLPSILREGPSRPNVLLLNGELALARGSLIPSQTEVVLAGDAPFIELRPSDGVRQGRNWALAEMLPAGSQSWSMRLVAGADLAAADNRLVRPDSSASLNLADTHYQAKIEQSSGGLVFTDQATDWGITPGTPVDESNEWICGLGPYCAEPPRWTWAPGNYLGMPAGTAIGEGDLWWCSVDPSLCIENLGKTVVTPQNQLFSVLRTGTGDLDLASAGNLTQWSPYGVYTAGTQAADVATGFNQPRGLFNGSVLGAGGADYEVLSTSQYQAWYPEHGGNLDIAVGGDVVGDQWAEKLTSSDPIRPLPPSAAVGNWLWRQGSADREGVPTAWWVNFGSYVRGAEGDAPYLVGFTGFGTLGGGNLSMRTGGDAGNIAPRGDGSIPSSGNLNPRSQGLVLAVAGTGRLTSDGALQLGGGGDLNVRIGGEVNPSREARATQTYSSSGFDGLYSGGTIHDLQGALINLRGSASLYSGALGGIDPRYDTLLRDPAEVRSRDAFSPTLASSTGGLTLVAGDTGMRLETRGDLVLGGVTDPGRVGVPNTVGFTAPDGSVYQGGGIAWFSLWTAHTSIDLFAAGGNLTPSTQLVEATNAIPMAGRNLSPSDGRFIYPSIVRAAAPEGSIYLGPSSGDMGGVSLNVSTTPYSLLLAPSLNGELELLAGDSIYAGGYSVQRSGADPANLPSIWTPAFAGYSDAALLNPIAGNGSPDGNPAVIGGLPLFYFGPDSAASLARDLQPARFYALTGDIVGLNSGAQIRFGEQAGNRAGQTWYEGAGPVWMRAGRDIVASGTPLGQRISAPSQISTDASFTGNLFVHDDPNDLSLVQAGRDILYGNFNVAGPGTLEISAGRNILMEDRAAITSLGAVVPGDSRPGADIVLQAGAAGADYQAFLERYLDPANLAQAGTPLAEQPGKVVRTYESELAKWLNERFGFAGDAEQAQAFFAGLPAEQQRIFARQVYFAELRAGGREYNEVGGVRQGSYLRGRNAIAALFPERDPAGNPISYEGDIVMYGGAGVHTDFGGDIQLLSPGGRQVFGIEGEAPPSTAGIVTQGQGDIQAYSRDSILLGQSRIMTTFGGSILAWSAEGDINAGRGSQTTVVYTPPR
RIYDAWGNVSLSPQVPSTGAGIATLNPIPEVAPGDIDLIAPLGTIDAGEAGIRVSGNVNVAALQVVNAANIQTQGQSSGIPLVASVNTGALTSASAAASSATQAAEDVSRQQQAAARQRMPSVITVQVLGFGNERLEPSRDGASRSPGYNPDSAVQVLGAGALGEQARSQLTDEERGNLIL', 'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNDAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVRLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIVLEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDAGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVIGLNDLEVGAATLDNGQQGLLGSQQSTRVSAQALVNRGDGEVSGKRVEARVGSLDNRGGKLIGDDLLVVASGAIDNRLGLFSAANRLDLRARSLDNSGKGTLSSRGGLEVSLGGLLDNRDEGNLLSQGAQRVTVGQLDNRAGGLLSSRSELNVHGASLDNRGGVLVADAGLSATGGAFDNRDGGSASGKAGVRVEVASLRNDQGGKLLSDGRLDLAANAVGNAGGRIAAKGDLQATLGSLAQQGGELVSEKTLKVAADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAALDNQGGRLLSDAGVTLQGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLISETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGAANLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLVGERGVSLDLRGGDLDNQGGLISARGPLSIERLSVLDNRQGGEISSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNTGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQAGMSLRLGGGALRNQQDGLIFSQAGALDVQAGSLDNRQGTLQAQGDNRLRIGGALDNQGGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGGLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFT
GRGGDWSNDGLLASDGSFRLDLSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGNRAASLRNLSGVIESGKDFSLRASLIENRRAVLESKSGLYTAKMEQTACIEGVNAGDCSGKRNAIWTITQRDKTEVTASSAMGQLLAGGDFAIDGGTLNNLSSLIGSGGNLTANLEVLDNQGLETGELETIRVLRTARGGDIGGIDQKSRNFTNLYWYQSANFDPARAGEIPAALNAILSDWSFEYEFPSKGPTPISSGDQSYAAVIQAAGDVTVNASTRIDNGVTRPGYTFVGSGRQVGDSAVGGSGVSVVVPLTSQLPPDLARRQVNPVTLPGFSLPQGDNGLFRLSSRFAEDGNGSAALGAGADRTQGGSGVSVGQQGAGNAAGTWQGQGVRVDGLAGAANVQGQGGSTLGGSLPGVARVQGVPGNATPSASHKYLIETNPALTELKQFLNSDYLLSGLGMNPDDSKKRLGDGLYEQRLIRDAVVARTGQRYIDGLSSDEALFRYLMDNAIAYKDQLHLQLGVGLSAEQMAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLAPNGALIQGRDVKLVSGGDLHNVGTLRARNDLSATADNLDNSGLIEAGKRLDLLAGDSIRNRQGGVIAGRDVSLTALTGDVINERSVTRYDSALDGRTWERSFADSAARVEAANSLNVQAGRDIANLGGVLQSRGDLSLDAGRDVTVAAVEDRQGQTRWSTSRLQSVTQLGAEVSAGRDLNVSAGRDLTAVASTLEARRDIALSAGRDVTLAAAANEEHAYSKTRKVTYQEDKVAQQGTRVDAGGDLAINAGQDLRLIASQASAGDEAYLVAGDKLELLAANDSNYYLYDKKKKGDFGRKETRRDEVTDVKAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWNSAKGKGQTDETLRQTQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKEAEQRGDVDWRMVQEVHDSWKYSNSGMGPATQIAVAIAAAAIGGMAAAGALSGAGVGASSFAMGAGVGAAGSLSGTAAVSLINNKGDLGKVLKDSFSSDSLKQIAIASLTGGLTAEYFDGILQTKTDPLTGKVTVDLSSLSGVGRFAANQAMQNATSTVLSQALGQGGSLNEALKSALYNSFAAAGFNFVGDIGQEYSLKPGDPSMVTMHALMGGLAAQVSGGDFATGAAAAGANEALVAKLDQAFKSLSPENREAMVTMGSQLVGVLAAAVRDPDVTGKALESAAWVAKNSTQYNFLNHQDVADLDNALQKCKSQGNCRQVEEEFKARSDENRRRLNGCVAVGNCAEIRAEIDAGSTALNELVARQETANPGGSDSDIAYGFLMGRNVVDWTTAGQLHLEQTANLWWNGNPQWQKEVGAYLDQTGFNPFGIGVPAMGGAAGKVTAKALMNALKAGELPKGEVAPGKANLPTIGALADAEAGMPYTHPVKLAAKATGTAGKIKIEAGAIPDANEVRAGQGLSGLGYDVTHQTTASAKGIQGQRTADLHVDGLGSIDVYTPKNLDPTKIVRAIEKKSNQAGGVLVQADLPSTDMSSIAARMWGKTNAQSIKTIFFQKPDGSLVRFDRPAGGG', 
'MDIRSPLNQCIALSLAGILFLNPIVAAAAGLALDKAAGGNTGLGQAGNGVPIVNIATPNGAGLSNNHFRDYNVGANGLILNNATGKTQGTQLGGIILGNPNLKGQAAQVILNQVTGGNRSTLAGYTEVAGQSARVIVANPHGITCQGCGFINTPRATLTTGKPIMDGQRLERFQVDGGDIVVEGAELNVGNLEQFDLITRSAKLNAKLYAKNLNIVTGRNDVQADSLQATPRAADGSEKPQLAIDSSALGGMYAGAIRLVGTEQGVGVKLAGDMAASGGDIRIDASGKLSLAQASSQGDLKIAAQAVELNGKTYAGGSAEIRSAEELVNRQSLAARERIALEAAHIDNAGVIEAGVEPDERRNARGDLELRSGTLRNAGSLVASRALEAKASQALDNQGGSLKGATVRVDGGHLDNRGGKLLAEGELRVEASSLDNRQDGLLQSRDRAVVKTRGDLDNRGGQVVGLNELQVQAAALDNRSAGLLSSKGDMDIEFARLDNSAGGKLVSERRTLLKADRLDNRSGRIVAGQDLDLSSRLIDNRAGDISSTSRVVASAREQLDNRGGKIVGDSGLDITTPRMLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGVIDADLQGLDQHDRGNLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLVADEANSSKGRIAANGSLHAKVGTLSQKGGELTSQDSLTLDLGILNNNAGRIAGNQGVDITARQVDNSVGEIASQGVVALNLTEQLDNRGGKIVGDSGLGITAPHVLNQDKGVLASRDGLRLSATELFNGAGGLLSSQKGIDVSLAGAFDNQAGSLDSRGFLTVKSAWLDNQGGTLSSAGALAVTSQGALNNQGGRLASDAGLSLSSASLDNSQAGAISGKGAVEIRTGNLNNSRKASIGSDAGLTLVAARVDNSQAGRIAAKGAIDAALQGLDQHDRGSLVSDTGITLDLNKGSLVNRAQGLIATPGTLLLRQLGVVDNSGGEISSDRAFTLATSALNNQGGRLLSGGALTLRIAQALDNSLEGIVSGAGGLDIQAFVLDNRSGSIGSKGAIDIGVTRLENDAGTLIAERGLKLAADEANNSKGRIVAKDELRAKLGALVQNGGELTTQGALALDADKVDNGAGRIAGNRGVVIDARQVDNRAGEIASQGVATLNLTEQLDNRGGKVVADSGLGITAPRVLNQDKGVIASRDGLRLSGTELFNGNAGLLSSQRHIEVTLDGVLDNQGKGALLSDGTLTVSAGRIHNQDATLSSAGALRLSSQEAVDNRGGKLVTDSSLRLTSASLDNSRSGIISANAAAEIHTGVLNNSQKGNLGSNDGLGLIATEVDNSQEGRITAKGMIDANIKGLDQQGKGRLVSNAGIILDLNEGTLANGAQGLIATPGTLLLRQLGMVDNSGGEISSDRAFTLTTSALTNQGGRLRSGGVLTLRIAQALDNSLEGVLSGTGGLDIRALALDNRSGSIGSKGAVDIDVSRLENDDGDLLSEGRLKLTAERANSVRGRIAARGDLHASVTAFNQAGGELSSEGALMLEADSLDNRSGGLVSADGNLTVSARRIDNRAGEIASPGQVTLDVAEQLDNRGGKAIGDSGLRLAAPRVLNQDGGVLASRDGLRLNGAELFNGNGGLLSSQQSIDVILDGVLGNQAGSLSSQGRLSVKSGRLDNQGGAVSSAGTLSLSSQGALNNQGGRVVTDAGAVLRSASLDNSQGGIVSAKGAAEIRTGSLNNSQKGGIGSGAGLALVADLVDNSQNGRITAKGAIDANLKGLDQQGSG
RLVSDTAIALDLRGGELVNRAQGLIATPGALLLRQLGVVDNSGGGEISSDRSFTLAATALSNRGGRVISGDSLTLRIAQALDNSLQGVLSASGGLDVAALVFDNHSGIVASKGDTHIGVNRLENEAGRVVSEGALDLTAKQVSSAKGRIAAKGDLQVTVGTLEQQGGELASQGTLTLDADSLDNRNGGLVSADGGVTAEARQIDNRGGEISSVAKVALAVREQLDNRGGKVIGDSELSLTVQRLLNQAKGVLASRDGLHLDGAELLNGDGGLLSSQRLVDVTLSGALDNQGSGALVSEESLTVKADQVNNQAGTFSSAGSLLVTSRGELNNQGGRLVTDAGATLNSTGFDNSRAGLVSAKGAVAIRTGALNNSQKGSIGGNTGVTLVAGLVDNGREGRISTKGTLDANLKGLLQQGGGSLVGERGVTLDLNGGTLDNHDLGLVSTPGALLLRQLGMVDNSVGGEISSDRAFTLAANTLNNQGGRLISSEALTLRIAKTLDNSLKGQVLATDGLAIESQVLDNRAGTIGSKGDARISVTSLDNAEQGSLVSEGRLELVADQVSNGNQGRIAARGVLEAAVGTLLQQGGELVSQGSLDLRADTLDNSQSGLIAANGGIAIEARQVDNRAGEISSTSKVAVNAREQLDNRGGKVIGDSGLRLTVQRLLNQAKGVLAGRDGLSLDGGELFNGDGGRLDSQNSLSVSLGGVLDNQGGALVSEGSLTARAARLDNRGGTFSSAGALALTSQAVLDNQGGRLLSDAGVTLKGASLDNSRSGVISAKGAVDIRTGVLDNSRNGGIGSNAGITLVAARLDNGQQGRVSAKGLLDANLKGLDQRGGGVLVSETGVTLDLNGGTLVNRDGGLIATPGALLLRQLGAVDNGAGGEISSDRAFTLAAASLDNRGGRLIGADSLTLRIAQALDNSLAGVISGAAGLDIAAARLDNSAKGTLASRAGIDLRVDGALDNHAEGTVSGARLTLASASLDNSGKGLLSGNAGLSVATGALDNAEGGQLISQGVLDVSSADLDNRGGALSGKQSLRLSAANLDNRGGLLTSDGELELTAGRVDSADGGEISARGDLRLTVERLVQRQGRLIGERGVSLDLRGGDLDNQGGLISARGPLSIERLNVLDNRQGGEIYSQQGFELLARRIDNGQQGRIISAGKLRLDADALGNAGAGLLSGWQGLTVTGGSLDNSAGGTLSSKDGELAISLGGALDNHGQGALVSKGAQRIDAASLDNAQGIVSGESDVTLSIAGKLDNGQGGLVSAQRALSFERDDTLLNNAGGRINGGSLLLKGASLDNSDGQLISQGRLDAILGGALVNAGAARLASGGDLLLRSASVDNRGGKLVSQGLLEISAGSLDNSASGTLASQADMSLRLGGGALRNQQDGLIFSQAGALEVQAGSLDNRQGTLQAQGDNRLRIGGALDNQAGRLDSRAGNLDLQSGSLDNGAGGVLNSAKGWLKLVTGLFDNSAGVTQAQSLEIRAGQGVRNQQGHLSALGGDNRIVTADFDNQGGGLYASGLLSLDGQRFLNQGAAAGQGGKVGAGRIDFSLAGALANRFGQLESESELHLRAAAIDNSGGSLRALGRSGSTRLVAGDLNNAYGVLESANQDLDLQLGSLANAGGRILHTGNGTFGLDSGQVIRAGGELTTNGLLDIRASEWTNSSVLQAGRLNLDIGTFRQTAEGKLLAVQSFTGRGGDWSNDGLLASNGSLRLELSGGYRGNGRATSLGDFALNAASLDLGNAASLAGGANVTLGAGNLLVNRGRITAAGDLVASAASLNNYGTLGGGGNLRLNAPALLNERGLLFSGADMTLRAGDITNLYGDVYSLGRLDIARDDAGGWANRLENISGNLESTGDMRFSVSSLLNRRETLEIEGDLQNSAIGVRCTGCQLSERWGKTRSSSELVWIREYKSTLGDSSAAASITAGRDLLVVGASLQNIASNISAVRDATLSLSNFENKGYALGEYAVRGVYSPPSKFGEELLMRILAYNAVNDPSYG
EGYASTGGRLPNIHYFDKNFNEKVSPLEVIHGNGKNGGPGWHLYFGTLDVEYPDTDRWNKAIGRIPAPNYSSKKTDAIPDLLKGLAPLDELTINKGANSTVGAVVQAGGRVTVNAAESFNNSVLQGFQAVQETQLPHQDIAVSSTTSAVVTLKSQLPADLARQQINPLTLPGFSLPQGQNGLFRLASQGAQVNQASGALKSASDLTQSGHGVSVSAQTGSGASGWSTQARRVGDDRVTSLAGSAYQGRVAEAIDALRASAPISGDGGNTGRFQAGEHQATTGLGGLVEGNASGHSGNGVILADLRGGLPSFSSLPASDHVQGTVPGHDGNGTILANWQGAQATVQASPSTVRVEGVVSSPGGNGSILADLPAEQSSVQALPSAVRAQGSLPRLEERSALLAEPPVGQPALQTLPSVARVEGVPSNATPSNSHKYLIETNPALTELKQFLNSDYLLGGLGINPDDSKKRLGDGLYEQRLVREAIVQRTGQRFIAGLNSDEAMFRYLMDNAIASKDVLGLTPGVTLSAAQVAALTHDIVWLEEVEVNGEKVLAPVVYLAQAEGRLGPNGALIQGRDVNLITGGDLRNAGTLRAQNDLSATAGNIDNSGLIEAGNRLDLLASGSIRNDQGGIIAGREVSLSALTGDVINERTVTQHQSSYRGTGTTEAFADSAARIEAAQKLTVSAGRDVANIGGVIDSKGDLALQGGRDVLVSAAVAERGWTAGSQAYQTQTTQMGAEVVAGRDISVSAGRDISVVGSRIDARRDVTFEAGRDVGLVAAANEEHAYGKTKKVTFQDDKITQQATRVDAGGDLAINAGQDLRLVASQASAGDEAYLVAGDKLELLAANDSSYYLYDKKSKGSFGSKKTRRDEITDVTAVGSQISSGGDLTLLSGGDQTYQGAKLESGNDLAIVSGGAVTFEAVKDLHQESHEKSKGDLAWQSSKGKGQTDETVRQSQIVAQGNLAIKAVEGLKIDLKHIDQKTVSQTIDAMVQADPQLAWLKQMEQRGDVDWRRVQELHDSWKYSNSGLGVGAQLAIAIVVAYFTAGAASAALGSMAGVGAGSGSMMAAAGSTAMVQAGTAVGTAAAGWANAAGTAVAMGMASNGAISTINNRGNLGDVVKDVTSSDALRGYVVAGTTAGLTAGVYDKWTSTQTGTSTALPNTGAVAPAAGLGTWQGVGQFTSNQLLQNGTSVLLDRALGGKGSLGDALQNSLANAFAAYGFKLIGDTTHGVLDDGSLGKIGLHALMGGLAAEAVGGDFRTGALAAGVNEALVDSLAKQYASLPIDDKKGLLIMSSQLIGVLAASTQGDADAKSLQTGAWVAGNATQHNYLSHWQEEKKRQEVDGCKDKQLCKTGIEAKWAIISAQQDVGIVVGVGGGIGLSTAETAVGVYELVKNWRETYAALEQLATSPEFRQQFGDNYLKGLEERAAFLTQAYEDAGWQGSVTAGVEGGRFAAELVGVLTAVKGGAQITAKLPTAAKNLVNAIAESPVSGSMSSQLGAVGDLGRLGGGGKGYVDILSHEAKQHILYGDKPGSGGHLWPGQAGKTVFPQNWSADKIVHEVGDIATSPSTKWYAQTGTGGVYTSKGDPAKWVAYEVRDGVRMRVVYQPATGKVITAFPDNAPIPPYKPIK']\n",
|
| 421 |
-
"['CAC14227', 'P12255', 'P20471', 'A64556', 'AAF25576', 'Q4L9P0', 'Q9I5N6', 'Q9I791', 'Q9I120']\n"
|
| 422 |
-
]
|
| 423 |
-
}
|
| 424 |
-
],
|
| 425 |
-
"source": [
|
| 426 |
-
"sequences = list(cpu_sequences['sequence'])\n",
|
| 427 |
-
"print(sequences)\n",
|
| 428 |
-
"accession = list(cpu_sequences['id'])\n",
|
| 429 |
-
"print(accession)"
|
| 430 |
-
]
|
| 431 |
-
},
|
| 432 |
-
{
|
| 433 |
-
"cell_type": "code",
|
| 434 |
-
"execution_count": 8,
|
| 435 |
-
"id": "2a1832cb",
|
| 436 |
-
"metadata": {},
|
| 437 |
-
"outputs": [
|
| 438 |
-
{
|
| 439 |
-
"data": {
|
| 440 |
-
"application/vnd.jupyter.widget-view+json": {
|
| 441 |
-
"model_id": "5df74f5eb4e24f72b645d0bbc1dc5c36",
|
| 442 |
-
"version_major": 2,
|
| 443 |
-
"version_minor": 0
|
| 444 |
-
},
|
| 445 |
-
"text/plain": [
|
| 446 |
-
"Processing Sequences: 0%| | 0/9 [00:00<?, ?it/s]"
|
| 447 |
-
]
|
| 448 |
-
},
|
| 449 |
-
"metadata": {},
|
| 450 |
-
"output_type": "display_data"
|
| 451 |
-
}
|
| 452 |
-
],
|
| 453 |
-
"source": [
|
| 454 |
-
"# Setup device\n",
|
| 455 |
-
"device = torch.device('cpu')\n",
|
| 456 |
-
"\n",
|
| 457 |
-
"# Load tokenizer and model\n",
|
| 458 |
-
"tokenizer = T5Tokenizer.from_pretrained('Rostlab/ProstT5', do_lower_case=False)\n",
|
| 459 |
-
"model = T5EncoderModel.from_pretrained(\"Rostlab/ProstT5\").to(device)\n",
|
| 460 |
-
"model.full() if device == 'cpu' else model.half()\n",
|
| 461 |
-
"\n",
|
| 462 |
-
"# Clean sequences\n",
|
| 463 |
-
"sequences = [\" \".join(list(re.sub(r\"[UZOB]\", \"X\", s))) for s in sequences]\n",
|
| 464 |
-
"sequences = [ \"<AA2fold> \" + s for s in sequences]\n",
|
| 465 |
-
"\n",
|
| 466 |
-
"# Process each sequence individually\n",
|
| 467 |
-
"for i, (seq, acc_id) in enumerate(tqdm(zip(sequences, accession), total=len(sequences), desc=\"Processing Sequences\")):\n",
|
| 468 |
-
" try:\n",
|
| 469 |
-
" # Tokenize\n",
|
| 470 |
-
" ids = tokenizer(\n",
|
| 471 |
-
" seq,\n",
|
| 472 |
-
" add_special_tokens=True,\n",
|
| 473 |
-
" return_tensors='pt'\n",
|
| 474 |
-
" ).to(device)\n",
|
| 475 |
-
"\n",
|
| 476 |
-
" # Forward pass\n",
|
| 477 |
-
" with torch.no_grad():\n",
|
| 478 |
-
" embedding_repr = model(\n",
|
| 479 |
-
" ids.input_ids,\n",
|
| 480 |
-
" attention_mask=ids.attention_mask\n",
|
| 481 |
-
" )\n",
|
| 482 |
-
"\n",
|
| 483 |
-
" # Compute actual length (excluding prefix)\n",
|
| 484 |
-
" real_len = ids.attention_mask[0].sum().item() - 1\n",
|
| 485 |
-
"\n",
|
| 486 |
-
" # Extract and average embeddings\n",
|
| 487 |
-
" emb = embedding_repr.last_hidden_state[0, 1:real_len]\n",
|
| 488 |
-
" emb_avg = emb.mean(dim=0).cpu().numpy()\n",
|
| 489 |
-
"\n",
|
| 490 |
-
" # Save embedding using accession ID\n",
|
| 491 |
-
" np.save(os.path.join(path, f\"{acc_id}.npy\"), emb_avg)\n",
|
| 492 |
-
"\n",
|
| 493 |
-
"\n",
|
| 494 |
-
" # Cleanup\n",
|
| 495 |
-
" del ids, embedding_repr, emb, emb_avg\n",
|
| 496 |
-
" torch.cuda.empty_cache()\n",
|
| 497 |
-
" gc.collect()\n",
|
| 498 |
-
"\n",
|
| 499 |
-
" except RuntimeError as e:\n",
|
| 500 |
-
" print(f\"Error {e} mientras se procesaba {acc_id}\")\n",
|
| 501 |
-
"\n"
|
| 502 |
-
]
|
| 503 |
-
}
|
| 504 |
-
],
|
| 505 |
-
"metadata": {
|
| 506 |
-
"kernelspec": {
|
| 507 |
-
"display_name": "tesisEnv",
|
| 508 |
-
"language": "python",
|
| 509 |
-
"name": "python3"
|
| 510 |
-
},
|
| 511 |
-
"language_info": {
|
| 512 |
-
"codemirror_mode": {
|
| 513 |
-
"name": "ipython",
|
| 514 |
-
"version": 3
|
| 515 |
-
},
|
| 516 |
-
"file_extension": ".py",
|
| 517 |
-
"mimetype": "text/x-python",
|
| 518 |
-
"name": "python",
|
| 519 |
-
"nbconvert_exporter": "python",
|
| 520 |
-
"pygments_lexer": "ipython3",
|
| 521 |
-
"version": "3.10.16"
|
| 522 |
-
}
|
| 523 |
-
},
|
| 524 |
-
"nbformat": 4,
|
| 525 |
-
"nbformat_minor": 5
|
| 526 |
-
}
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1ccbc986f147709315a8cb774f4c8523f9da34af321c111781e15e2d8c30c5f1
|
| 3 |
+
size 56006
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
notebooks/__pycache__/my_utils.cpython-310.pyc
DELETED
|
Binary file (14.2 kB)
|
|
|
notebooks/hyperparamsRF.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
src/__init__.py
ADDED
|
File without changes
|
src/cli.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import tkinter as tk
|
| 2 |
+
from tkinter import Menu
|
| 3 |
+
from src.my_utils import predict_with_prost
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
def menu():
    """
    Build and run the main Tkinter window of the Protein Tools application.

    The window contains:
    - A "File" menu with "New", "Open...", "Close", a "Preferences" sub-menu
      ("Keyboard Shortcuts", "Color Themes") and "Exit". Only "Exit" has a
      callback attached; the other entries are placeholders.
    - A "Help" menu with "Welcome" and "About..." placeholder entries.
    - Two buttons below the menu bar: "Load FASTA", which calls
      `predict_with_prost`, and "Exit", which closes the window.

    The call blocks inside the Tk main loop until the window is closed.

    Returns:
        None
    """
    # Root window
    root = tk.Tk()
    root.geometry('320x200')
    root.title('Protein Tools Menu')

    # Menu bar attached to the root window
    menubar = Menu(root)
    root.config(menu=menubar)

    # "File" menu
    file_menu = Menu(menubar, tearoff=0)
    file_menu.add_command(label='New')
    file_menu.add_command(label='Open...')
    file_menu.add_command(label='Close')
    file_menu.add_separator()

    # "Preferences" sub-menu nested inside "File"
    sub_menu = Menu(file_menu, tearoff=0)
    sub_menu.add_command(label='Keyboard Shortcuts')
    sub_menu.add_command(label='Color Themes')

    file_menu.add_cascade(label="Preferences", menu=sub_menu)
    file_menu.add_separator()
    file_menu.add_command(label='Exit', command=root.destroy)
    menubar.add_cascade(label="File", menu=file_menu, underline=0)

    # "Help" menu
    help_menu = Menu(menubar, tearoff=0)
    help_menu.add_command(label='Welcome')
    help_menu.add_command(label='About...')
    menubar.add_cascade(label="Help", menu=help_menu, underline=0)

    # Buttons below the menu bar
    btn_fasta = tk.Button(root, text="Load FASTA", command=predict_with_prost)
    btn_fasta.pack(pady=5)

    # Use destroy (like File > Exit) rather than quit: quit only stops the
    # main loop and can leave the window alive in a longer-lived interpreter.
    btn_exit = tk.Button(root, text="Exit", command=root.destroy)
    btn_exit.pack(pady=5)

    root.mainloop()


# Only launch the GUI when executed as a script (e.g. `python -m src.cli`),
# so that importing this module has no side effects.
if __name__ == "__main__":
    menu()
|
{notebooks → src}/my_utils.py
RENAMED
|
@@ -5,7 +5,9 @@ from pprint import pprint
|
|
| 5 |
from io import StringIO
|
| 6 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
from urllib.error import HTTPError
|
| 8 |
-
from typing import Literal
|
|
|
|
|
|
|
| 9 |
|
| 10 |
|
| 11 |
import pandas as pd
|
|
@@ -27,7 +29,7 @@ import umap
|
|
| 27 |
import requests
|
| 28 |
from Bio import Entrez
|
| 29 |
from Bio import SeqIO
|
| 30 |
-
from tqdm
|
| 31 |
|
| 32 |
# Visualization libraries
|
| 33 |
import seaborn as sns
|
|
@@ -36,7 +38,9 @@ import plotly.express as px
|
|
| 36 |
|
| 37 |
from esm.models.esmc import ESMC
|
| 38 |
from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
|
| 39 |
-
from transformers import T5Tokenizer, T5EncoderModel
|
|
|
|
|
|
|
| 40 |
|
| 41 |
import torch
|
| 42 |
import gc
|
|
@@ -44,7 +48,18 @@ import gc
|
|
| 44 |
|
| 45 |
|
| 46 |
# Load one chunk of embeddings
|
| 47 |
-
def load_emb(path: str, acc: list[str])->list[np.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
X = []
|
| 49 |
for a in tqdm(acc, desc = 'Cargando embeddings'):
|
| 50 |
emb : np.ndarray = np.load(os.path.join(path, f"{a}.npy"))
|
|
@@ -59,7 +74,14 @@ def load_emb(path: str, acc: list[str])->list[np.array]:
|
|
| 59 |
X.append(emb)
|
| 60 |
return X
|
| 61 |
|
| 62 |
-
def confusion(title : str, y_true: np.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
cm = confusion_matrix(y_true = y_true,
|
| 65 |
y_pred = y_pred,
|
|
@@ -77,27 +99,46 @@ def confusion(title : str, y_true: np.array, y_pred: np.array) -> None:
|
|
| 77 |
plt.show()
|
| 78 |
|
| 79 |
def perplexity(X):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
X_array = np.vstack(X)
|
| 81 |
perp= np.arange(5, 55, 5)
|
| 82 |
divergence = []
|
| 83 |
|
| 84 |
for i in perp:
|
| 85 |
model = TSNE(n_components=2, init="pca", perplexity=i)
|
| 86 |
-
reduced = model.fit_transform(X_array)
|
| 87 |
divergence.append(model.kl_divergence_)
|
| 88 |
fig = px.line(x=perp, y=divergence, markers=True)
|
| 89 |
fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
|
| 90 |
fig.update_traces(line_color="red", line_width=1)
|
| 91 |
fig.show()
|
| 92 |
|
| 93 |
-
def plot_umap(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
reducer = umap.UMAP(n_neighbors=30, random_state=42)
|
| 95 |
-
|
| 96 |
|
| 97 |
-
|
| 98 |
-
embedding = reducer.fit_transform(
|
|
|
|
| 99 |
|
| 100 |
-
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=y, hover_data=
|
| 101 |
fig.update_layout(
|
| 102 |
title=title,
|
| 103 |
xaxis_title="First UMAP",
|
|
@@ -106,7 +147,7 @@ def plot_umap(X: list[np.array], y: list[str], title: str, org : list[str]) -> N
|
|
| 106 |
fig.show()
|
| 107 |
|
| 108 |
|
| 109 |
-
def plot_PCA(X: np.
|
| 110 |
X_array = np.vstack(X)
|
| 111 |
pca = PCA(n_components=2, random_state=42)
|
| 112 |
|
|
@@ -133,22 +174,33 @@ def plot_PCA(X: np.array, labels: list[str], title: str, org : list[str], scale:
|
|
| 133 |
fig.show()
|
| 134 |
|
| 135 |
|
| 136 |
-
def tsne_plot(X, y, org
|
| 137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 138 |
tsne = TSNE(n_components=2, perplexity=60, random_state=42)
|
| 139 |
tsne_fit = tsne.fit_transform(X_array)
|
| 140 |
|
| 141 |
-
fig = px.scatter(x=tsne_fit[:, 0], y=tsne_fit[:, 1], color=y, hover_data=
|
| 142 |
fig.update_layout(
|
| 143 |
title="t-SNE",
|
| 144 |
xaxis_title="First t-SNE",
|
| 145 |
-
yaxis_title="Second t-SNE"
|
| 146 |
)
|
| 147 |
-
|
| 148 |
fig.show()
|
| 149 |
-
|
| 150 |
-
|
| 151 |
def plot_emb(X, y, model_name, org : list[str]):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
print(f"Plotting embeddings for: {model_name}")
|
| 153 |
plot_PCA(X, y, title="PCA", scale=True, org = org)
|
| 154 |
tsne_plot(X, y,org = org)
|
|
@@ -225,7 +277,7 @@ def train_svm(title : str, X: np.ndarray, y: np.ndarray, params:dict) -> tuple[P
|
|
| 225 |
return pipeline, evaluation
|
| 226 |
|
| 227 |
|
| 228 |
-
def randomSVM(X: np.
|
| 229 |
|
| 230 |
X_train, _, y_train, _ = train_test_split(X,
|
| 231 |
y,
|
|
@@ -336,8 +388,24 @@ def gridSearch(X: np.ndarray, y: np.ndarray, grid: dict):
|
|
| 336 |
|
| 337 |
|
| 338 |
def fetch_uniprot_sequence(uniprot_id: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 339 |
url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
|
| 340 |
-
response = requests.get(url)
|
| 341 |
|
| 342 |
if response.status_code == 200:
|
| 343 |
try:
|
|
@@ -346,10 +414,10 @@ def fetch_uniprot_sequence(uniprot_id: str):
|
|
| 346 |
record = SeqIO.read(fasta_io, "fasta")
|
| 347 |
return str(record.seq)
|
| 348 |
|
| 349 |
-
except
|
| 350 |
# fallback to UniSave if the standard endpoint is not available
|
| 351 |
url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
|
| 352 |
-
response = requests.get(url)
|
| 353 |
|
| 354 |
if response.status_code == 200:
|
| 355 |
try:
|
|
@@ -358,7 +426,7 @@ def fetch_uniprot_sequence(uniprot_id: str):
|
|
| 358 |
fasta_io = StringIO(entries[1])
|
| 359 |
record = SeqIO.read(fasta_io, "fasta")
|
| 360 |
return str(record.seq)
|
| 361 |
-
except
|
| 362 |
print(f'No se pudo obtener la entrada FASTA para {uniprot_id} desde UniSave')
|
| 363 |
else:
|
| 364 |
print(f'UniSave URL inválido: {url}')
|
|
@@ -372,11 +440,13 @@ def fetch_refseq_sequence(refseq_id : str):
|
|
| 372 |
"""
|
| 373 |
|
| 374 |
Entrez.email = "puglia.jd@gmail.com" # REQUIRED
|
| 375 |
-
Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
|
| 376 |
# Check if the ID is NaN or None
|
| 377 |
if pd.isna(refseq_id) or refseq_id is None:
|
| 378 |
return None
|
| 379 |
-
|
|
|
|
|
|
|
| 380 |
try:
|
| 381 |
handle = Entrez.efetch(
|
| 382 |
db="protein",
|
|
@@ -434,7 +504,9 @@ def _fetch_sequence_for_row(idx, row):
|
|
| 434 |
return idx, sequence
|
| 435 |
|
| 436 |
|
| 437 |
-
|
|
|
|
|
|
|
| 438 |
"""
|
| 439 |
Add a 'sequence' column to the dataframe by fetching sequences from
|
| 440 |
SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
|
|
@@ -481,7 +553,7 @@ def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: int = None, max_
|
|
| 481 |
f"({round(success_count/total_rows*100, 2)}%)")
|
| 482 |
return result_df
|
| 483 |
|
| 484 |
-
def esm_embed_sequence(model : Literal["esmc_300m", "esmc_600m"], sequence : str, device : str) ->
|
| 485 |
|
| 486 |
"""
|
| 487 |
Embed a protein sequence using the specified ESM model.
|
|
@@ -524,7 +596,9 @@ def esm_save_emb(model: Literal["esmc_300m", "esmc_600m"],
|
|
| 524 |
assert len(seq_list) == len(id_list), "Sequence and ID lists must be the same length."
|
| 525 |
os.makedirs(path, exist_ok=True)
|
| 526 |
|
| 527 |
-
for i, (seq, acc) in enumerate(
|
|
|
|
|
|
|
| 528 |
try:
|
| 529 |
output: LogitsOutput = esm_embed_sequence(model=model, sequence=seq, device = device)
|
| 530 |
emb_array = output.embeddings.cpu().numpy()
|
|
@@ -543,65 +617,218 @@ def esm_save_emb(model: Literal["esmc_300m", "esmc_600m"],
|
|
| 543 |
gc.collect()
|
| 544 |
torch.cuda.empty_cache()
|
| 545 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 546 |
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
|
|
|
|
|
|
| 551 |
|
| 552 |
-
|
|
|
|
| 553 |
|
| 554 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 555 |
Args:
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 559 |
|
|
|
|
|
|
|
|
|
|
| 560 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 561 |
|
| 562 |
-
|
|
|
|
|
|
|
| 563 |
|
| 564 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 565 |
|
| 566 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 567 |
|
| 568 |
-
|
| 569 |
-
|
| 570 |
-
|
| 571 |
-
|
|
|
|
| 572 |
|
| 573 |
-
|
| 574 |
-
|
| 575 |
-
|
| 576 |
-
ids = tokenizer(
|
| 577 |
-
seq,
|
| 578 |
-
add_special_tokens=True,
|
| 579 |
-
return_tensors='pt'
|
| 580 |
-
).to(device)
|
| 581 |
-
|
| 582 |
-
# Forward pass
|
| 583 |
-
with torch.no_grad():
|
| 584 |
-
embedding_repr = model(
|
| 585 |
-
ids.input_ids,
|
| 586 |
-
attention_mask=ids.attention_mask
|
| 587 |
-
)
|
| 588 |
|
| 589 |
-
|
| 590 |
-
|
| 591 |
-
|
| 592 |
-
|
|
|
|
| 593 |
|
| 594 |
-
|
| 595 |
-
|
|
|
|
| 596 |
|
| 597 |
-
|
| 598 |
|
| 599 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 600 |
|
| 601 |
-
|
| 602 |
-
|
| 603 |
|
| 604 |
-
|
| 605 |
-
|
| 606 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
from io import StringIO
|
| 6 |
from concurrent.futures import ThreadPoolExecutor, as_completed
|
| 7 |
from urllib.error import HTTPError
|
| 8 |
+
from typing import Literal, Optional
|
| 9 |
+
import tkinter as tk
|
| 10 |
+
from tkinter import filedialog
|
| 11 |
|
| 12 |
|
| 13 |
import pandas as pd
|
|
|
|
| 29 |
import requests
|
| 30 |
from Bio import Entrez
|
| 31 |
from Bio import SeqIO
|
| 32 |
+
from tqdm import tqdm
|
| 33 |
|
| 34 |
# Visualization libraries
|
| 35 |
import seaborn as sns
|
|
|
|
| 38 |
|
| 39 |
from esm.models.esmc import ESMC
|
| 40 |
from esm.sdk.api import ESMProtein, LogitsConfig, ESMProteinError, LogitsOutput
|
| 41 |
+
from transformers import T5Tokenizer, T5EncoderModel, PreTrainedModel
|
| 42 |
+
|
| 43 |
+
from joblib import load
|
| 44 |
|
| 45 |
import torch
|
| 46 |
import gc
|
|
|
|
| 48 |
|
| 49 |
|
| 50 |
# Load one chunk of embeddings
|
| 51 |
+
def load_emb(path: str, acc: list[str])->list[np.ndarray]:
|
| 52 |
+
|
| 53 |
+
""" Load embeddings from a specified path.
|
| 54 |
+
Args:
|
| 55 |
+
path (str): Directory where embeddings are stored.
|
| 56 |
+
acc (list[str]): List of accession IDs corresponding to the embeddings.
|
| 57 |
+
Returns:
|
| 58 |
+
list[np.ndarray]: List of loaded embeddings as numpy arrays.
|
| 59 |
+
"""
|
| 60 |
+
if not os.path.exists(path):
|
| 61 |
+
raise FileNotFoundError(f"The specified path does not exist: {path}")
|
| 62 |
+
|
| 63 |
X = []
|
| 64 |
for a in tqdm(acc, desc = 'Cargando embeddings'):
|
| 65 |
emb : np.ndarray = np.load(os.path.join(path, f"{a}.npy"))
|
|
|
|
| 74 |
X.append(emb)
|
| 75 |
return X
|
| 76 |
|
| 77 |
+
def confusion(title : str, y_true: np.ndarray, y_pred: np.ndarray) -> None:
|
| 78 |
+
|
| 79 |
+
""" Plot a confusion matrix for the given true and predicted labels.
|
| 80 |
+
Args:
|
| 81 |
+
title (str): Title for the confusion matrix plot.
|
| 82 |
+
y_true (np.ndarray): True labels.
|
| 83 |
+
y_pred (np.ndarray): Predicted labels.
|
| 84 |
+
"""
|
| 85 |
|
| 86 |
cm = confusion_matrix(y_true = y_true,
|
| 87 |
y_pred = y_pred,
|
|
|
|
| 99 |
plt.show()
|
| 100 |
|
| 101 |
def perplexity(X):
    """
    Plot the t-SNE KL divergence obtained for a range of perplexity values.

    A t-SNE model is fitted once per perplexity in 5, 10, ..., 50 and the
    final KL divergence of each fit is drawn as a line plot.

    Args:
        X (list[np.ndarray]): Feature arrays; they are stacked row-wise with
            `np.vstack` before fitting.

    Returns:
        None: Displays an interactive Plotly line chart.
    """

    X_array = np.vstack(X)
    perp = np.arange(5, 55, 5)
    divergence = []

    for i in perp:
        model = TSNE(n_components=2, init="pca", perplexity=i)
        # kl_divergence_ only exists on a fitted estimator, so the model must
        # be fitted before reading it; the projection itself is not needed.
        model.fit_transform(X_array)
        divergence.append(model.kl_divergence_)

    fig = px.line(x=perp, y=divergence, markers=True)
    fig.update_layout(xaxis_title="Perplexity Values", yaxis_title="Divergence")
    fig.update_traces(line_color="red", line_width=1)
    fig.show()
|
| 120 |
|
| 121 |
+
def plot_umap(x: list[np.ndarray], y: list[str], title: str, org: list[str]) -> None:
|
| 122 |
+
"""
|
| 123 |
+
Plot a 2D UMAP projection of high-dimensional data with color-coded labels and hover information.
|
| 124 |
+
|
| 125 |
+
Args:
|
| 126 |
+
x (list[np.ndarray]): List of feature arrays to be concatenated and visualized.
|
| 127 |
+
y (list[str]): List of labels corresponding to each sample in x, used for coloring the scatter plot.
|
| 128 |
+
title (str): Title of the plot.
|
| 129 |
+
org (list[str]): List of organism or group identifiers for each sample, shown in hover data.
|
| 130 |
+
|
| 131 |
+
Returns:
|
| 132 |
+
None: Displays an interactive UMAP scatter plot using Plotly.
|
| 133 |
+
"""
|
| 134 |
reducer = umap.UMAP(n_neighbors=30, random_state=42)
|
| 135 |
+
x_array = np.vstack(x)
|
| 136 |
|
| 137 |
+
scaled_x = StandardScaler().fit_transform(x_array)
|
| 138 |
+
embedding = reducer.fit_transform(scaled_x)
|
| 139 |
+
embedding = np.array(embedding) # Ensure it's a NumPy array for slicing
|
| 140 |
|
| 141 |
+
fig = px.scatter(x=embedding[:, 0], y=embedding[:, 1], color=y, hover_data=[org, y])
|
| 142 |
fig.update_layout(
|
| 143 |
title=title,
|
| 144 |
xaxis_title="First UMAP",
|
|
|
|
| 147 |
fig.show()
|
| 148 |
|
| 149 |
|
| 150 |
+
def plot_PCA(X: np.ndarray, labels: list[str], title: str, org : list[str], scale: bool) -> None:
|
| 151 |
X_array = np.vstack(X)
|
| 152 |
pca = PCA(n_components=2, random_state=42)
|
| 153 |
|
|
|
|
| 174 |
fig.show()
|
| 175 |
|
| 176 |
|
| 177 |
+
def tsne_plot(X, y, org: list[str]) -> None:
    """
    Show an interactive 2D t-SNE scatter plot of the given features.

    Args:
        X: Either a list of arrays (stacked row-wise with `np.vstack`) or an
            object that is passed to the scaler unchanged.
        y: Labels, one per sample; used to color the points and shown on hover.
        org (list[str]): Per-sample identifiers shown in the hover data.

    Returns:
        None: Displays the figure with Plotly.
    """
    # Lists are stacked into one 2D array; anything else is used as given.
    features = np.vstack(X) if isinstance(X, list) else X
    standardized = StandardScaler().fit_transform(features)

    # Fixed seed and perplexity, as configured for this plot.
    projection = TSNE(n_components=2, perplexity=60, random_state=42).fit_transform(standardized)
    first_axis = projection[:, 0]
    second_axis = projection[:, 1]

    figure = px.scatter(x=first_axis, y=second_axis, color=y, hover_data=[org, y])
    figure.update_layout(title="t-SNE", xaxis_title="First t-SNE", yaxis_title="Second t-SNE")
    figure.show()
|
|
|
|
|
|
|
| 194 |
def plot_emb(X, y, model_name, org : list[str]):
|
| 195 |
+
|
| 196 |
+
""" Plot embeddings using PCA, t-SNE, and UMAP.
|
| 197 |
+
Args:
|
| 198 |
+
X (list[np.ndarray]): List of feature arrays to be concatenated and visualized.
|
| 199 |
+
y (list[str]): List of labels corresponding to each sample in X, used for coloring the scatter plot.
|
| 200 |
+
model_name (str): Name of the model used for generating embeddings.
|
| 201 |
+
org (list[str]): List of organism or group identifiers for each sample, shown in hover data.
|
| 202 |
+
"""
|
| 203 |
+
|
| 204 |
print(f"Plotting embeddings for: {model_name}")
|
| 205 |
plot_PCA(X, y, title="PCA", scale=True, org = org)
|
| 206 |
tsne_plot(X, y,org = org)
|
|
|
|
| 277 |
return pipeline, evaluation
|
| 278 |
|
| 279 |
|
| 280 |
+
def randomSVM(X: np.ndarray, y = np.ndarray) -> dict:
|
| 281 |
|
| 282 |
X_train, _, y_train, _ = train_test_split(X,
|
| 283 |
y,
|
|
|
|
| 388 |
|
| 389 |
|
| 390 |
def fetch_uniprot_sequence(uniprot_id: str):
|
| 391 |
+
|
| 392 |
+
"""
|
| 393 |
+
Fetch the protein sequence for the given UniProt ID.
|
| 394 |
+
Returns the raw amino-acid sequence as a string.
|
| 395 |
+
Args:
|
| 396 |
+
uniprot_id: UniProt ID to fetch the sequence for.
|
| 397 |
+
Returns:
|
| 398 |
+
str: Amino-acid sequence in FASTA format.
|
| 399 |
+
Raises:
|
| 400 |
+
HTTPError: If the request to UniProt fails.
|
| 401 |
+
Note:
|
| 402 |
+
This function first tries to fetch the sequence from the standard UniProt endpoint.
|
| 403 |
+
If that fails, it falls back to the UniSave endpoint.
|
| 404 |
+
If both fail, it returns None and prints an error message.
|
| 405 |
+
"""
|
| 406 |
+
|
| 407 |
url = f"https://rest.uniprot.org/uniprotkb/{uniprot_id}.fasta"
|
| 408 |
+
response = requests.get(url, timeout=10)
|
| 409 |
|
| 410 |
if response.status_code == 200:
|
| 411 |
try:
|
|
|
|
| 414 |
record = SeqIO.read(fasta_io, "fasta")
|
| 415 |
return str(record.seq)
|
| 416 |
|
| 417 |
+
except ValueError:
|
| 418 |
# fallback to UniSave if the standard endpoint is not available
|
| 419 |
url = f"https://rest.uniprot.org/unisave/{uniprot_id}.fasta"
|
| 420 |
+
response = requests.get(url, timeout=10)
|
| 421 |
|
| 422 |
if response.status_code == 200:
|
| 423 |
try:
|
|
|
|
| 426 |
fasta_io = StringIO(entries[1])
|
| 427 |
record = SeqIO.read(fasta_io, "fasta")
|
| 428 |
return str(record.seq)
|
| 429 |
+
except ValueError:
|
| 430 |
print(f'No se pudo obtener la entrada FASTA para {uniprot_id} desde UniSave')
|
| 431 |
else:
|
| 432 |
print(f'UniSave URL inválido: {url}')
|
|
|
|
| 440 |
"""
|
| 441 |
|
| 442 |
Entrez.email = "puglia.jd@gmail.com" # REQUIRED
|
| 443 |
+
Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
|
| 444 |
# Check if the ID is NaN or None
|
| 445 |
if pd.isna(refseq_id) or refseq_id is None:
|
| 446 |
return None
|
| 447 |
+
|
| 448 |
+
fasta_data = None
|
| 449 |
+
|
| 450 |
try:
|
| 451 |
handle = Entrez.efetch(
|
| 452 |
db="protein",
|
|
|
|
| 504 |
return idx, sequence
|
| 505 |
|
| 506 |
|
| 507 |
+
|
| 508 |
+
|
| 509 |
+
def fetch_sequences_for_dataframe(df: pd.DataFrame, batch_size: Optional[int] = None, max_workers: int = 5) -> pd.DataFrame:
|
| 510 |
"""
|
| 511 |
Add a 'sequence' column to the dataframe by fetching sequences from
|
| 512 |
SwissProt or RefSeq based on available IDs, with parallel execution and a progress bar.
|
|
|
|
| 553 |
f"({round(success_count/total_rows*100, 2)}%)")
|
| 554 |
return result_df
|
| 555 |
|
| 556 |
+
def esm_embed_sequence(model : Literal["esmc_300m", "esmc_600m"], sequence : str, device : str) -> LogitsOutput:
|
| 557 |
|
| 558 |
"""
|
| 559 |
Embed a protein sequence using the specified ESM model.
|
|
|
|
| 596 |
assert len(seq_list) == len(id_list), "Sequence and ID lists must be the same length."
|
| 597 |
os.makedirs(path, exist_ok=True)
|
| 598 |
|
| 599 |
+
for i, (seq, acc) in enumerate(
|
| 600 |
+
tqdm(zip(seq_list, id_list),
|
| 601 |
+
total=len(seq_list), desc="Saving embeddings")):
|
| 602 |
try:
|
| 603 |
output: LogitsOutput = esm_embed_sequence(model=model, sequence=seq, device = device)
|
| 604 |
emb_array = output.embeddings.cpu().numpy()
|
|
|
|
| 617 |
gc.collect()
|
| 618 |
torch.cuda.empty_cache()
|
| 619 |
|
| 620 |
+
def prost_embed_sequence(seq : str,
                         acc : str,
                         tokenizer : T5Tokenizer,
                         model : PreTrainedModel,
                         device : torch.device = torch.device(
                             'cuda:0'
                             if torch.cuda.is_available()
                             else 'cpu'
                         ))-> Optional[np.ndarray]:

    """
    Embed a protein sequence with ProstT5 and return the mean residue embedding.

    Args:
        seq (str): Amino-acid sequence. The letters U, Z, O and B are replaced with 'X'.
        acc (str): Accession or identifier of the sequence; only used in log messages.
        tokenizer (T5Tokenizer): Tokenizer used to encode the sequence.
        model (PreTrainedModel): Encoder whose output exposes `last_hidden_state`.
            It is moved to `device` and cast to half precision (non-CPU) or
            float32 (CPU) on every call.
        device (torch.device, optional): Device to run on. Defaults to CUDA if
            available, otherwise CPU.
    Returns:
        Optional[np.ndarray]: Embedding averaged over the residue positions, or
            None if no residue tokens remain after tokenization or if a
            RuntimeError/ValueError is raised while embedding.
    """

    model = model.to(device) #type: ignore
    model = model.half() if str(device) != 'cpu' else model.float() # Half precision off-CPU, full precision on CPU

    seq = re.sub(r"[UZOB]", "X", seq) # Replace non-standard amino acids with 'X'
    # ProstT5 expects space-separated residues preceded by the "<AA2fold>"
    # prefix for amino-acid input; the slice further down skips that prefix token.
    seq = "<AA2fold> " + " ".join(seq)

    try:
        # Tokenize the sequence
        ids = tokenizer(seq, add_special_tokens=True, return_tensors='pt')

        # Move tensors to device after tokenization
        ids = {k: v.to(device) for k, v in ids.items()}

        # Forward pass through the model without tracking gradients
        with torch.no_grad():
            embedding_repr = model(
                ids['input_ids'],
                attention_mask=ids['attention_mask']
            )

        # Number of attended tokens minus the trailing special token
        real_len = ids['attention_mask'][0].sum().item() - 1

        # Positions 1..real_len-1 hold the residues (position 0 is the prefix);
        # with real_len <= 1 that slice is empty and its mean would be NaN.
        if real_len <= 1:
            print(f"Sequence too short after tokenization for {acc}")
            return None

        # Extract and average the per-residue embeddings
        emb = embedding_repr.last_hidden_state[0, 1:real_len]
        emb_avg = emb.mean(dim=0).cpu().numpy()

        return emb_avg

    except RuntimeError as e:
        print(f"RuntimeError while processing {acc}: {e}")
        return None
    except ValueError as e:
        print(f"ValueError while processing {acc}: {e}")
        return None
|
| 682 |
+
|
| 683 |
+
def fasta_to_seq(fasta_file: str) -> Optional[tuple[list[str], list[str]]]:
    """
    Reads a FASTA file and extracts the sequences together with their record IDs.

    Args:
        fasta_file (str): Path to the FASTA file to be read.

    Returns:
        Optional[tuple[list[str], list[str]]]: ``(sequences, ids)`` where ``sequences[i]`` is the
        sequence of the record whose ID is ``ids[i]``. Both lists are empty when the file
        contains no records. Returns None (after printing the error) when the file cannot be
        opened/read or cannot be parsed as FASTA.
    """
    sequences = []
    ids = []

    # open() is inside the try so a missing/unreadable file takes the same
    # "print and return None" path as a parse failure instead of raising.
    try:
        with open(fasta_file, 'r', encoding='utf-8') as f:
            for record in SeqIO.parse(f, "fasta"):
                sequences.append(str(record.seq))
                ids.append(str(record.id))
    except (OSError, ValueError) as e:
        print(f"Error reading {fasta_file}: {e}")
        return None

    return sequences, ids
|
| 710 |
+
|
| 711 |
+
def save_predictions_to_txt(predictions_dict: dict[str, tuple[list[str], list[float]]],
                            output_file: str) -> None:
    """
    Save predictions to a text file, one sequence per line.

    The file starts with the header ``Sequence_ID,Predictions``. Each following line is
    ``<seq_id>,<Class1> (<prob1>), <Class2> (<prob2>), ...`` with probabilities formatted to
    four decimals and classes ordered from most to least probable (ties keep input order).

    Args:
        predictions_dict: Dictionary with sequence_id as key and (class_names, probabilities) as value.
            The two lists are paired element-wise.
        output_file: Path to the output text file. It is overwritten if it already exists.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write("Sequence_ID,Predictions\n")  # Header

        for seq_id, (class_names, probabilities) in predictions_dict.items():
            # Rank the (class, probability) pairs and build the line from the ranked
            # pairs themselves, so the written order is always descending.
            ranked = sorted(zip(class_names, probabilities),
                            key=lambda pair: pair[1],
                            reverse=True)
            pred_line = ", ".join(f"{cls} ({prob:.4f})" for cls, prob in ranked)

            f.write(f"{seq_id},{pred_line}\n")
|
| 734 |
|
| 735 |
+
def predict_with_prost(model_path: Optional[str] = None) -> None:
    """
    Interactively predict class probabilities for the sequences of one FASTA file.

    Opens a dialog to pick a FASTA file and a second dialog to pick an output directory,
    embeds every sequence with ProstT5, runs the pre-trained classifier's ``predict_proba``
    on the embeddings, and writes ``<fasta basename>_predictions.txt`` into the chosen
    directory. Progress and the first three predictions are printed.

    Args:
        model_path (Optional[str]): Path to the joblib-serialised classifier. Defaults to
            ``Models/rfProst.joblib`` in the parent directory of the directory containing
            this module. joblib files are unpickled on load, so only pass trusted files.

    Returns:
        None. The function returns early (after printing a message) when a dialog is
        cancelled, the FASTA file yields no sequences, the classifier file is missing,
        or no sequence could be embedded.
    """
    root = tk.Tk()
    root.withdraw()
    try:
        fasta_path : str = filedialog.askopenfilename(
            title="Select a FASTA file",
            filetypes=[("FASTA files", "*.fasta *.fa")],
            initialdir="."
        )

        if not fasta_path:
            print("No file selected.")
            return

        # Select output directory for results
        output_dir: str = filedialog.askdirectory(
            title="Select output directory for results",
            initialdir="."
        )

        if not output_dir:
            print("No output directory selected.")
            return
    finally:
        # The hidden root window is only needed while the dialogs are open.
        root.destroy()

    result = fasta_to_seq(fasta_path)

    if result is None or not result[0]:
        print("No sequences found in the FASTA file.")
        return

    sequences, ids = result
    print(f"Sequences loaded from {fasta_path}: {len(sequences)} sequences found.")

    # Resolve the classifier relative to this module instead of a machine-specific
    # absolute path, and load it before embedding so a missing file fails fast.
    if model_path is None:
        module_dir = os.path.dirname(os.path.abspath(__file__))
        model_path = os.path.join(os.path.dirname(module_dir), "Models", "rfProst.joblib")

    print("Loading pre-trained classifier for prediction...")
    try:
        predictor = load(model_path)
    except FileNotFoundError:
        print(f"Error: Could not find the model file '{model_path}'")
        print("Please check the path to your trained model.")
        return

    print("Embedding sequences using ProstT5...")

    tokenizer : T5Tokenizer = T5Tokenizer.from_pretrained("Rostlab/ProstT5", do_lower_case=False)
    model : PreTrainedModel = T5EncoderModel.from_pretrained("Rostlab/ProstT5")

    embeddings : dict[str, np.ndarray] = {}

    for seq, acc in tqdm(zip(sequences, ids), total=len(sequences), desc="Embedding sequences"):
        emb = prost_embed_sequence(seq, acc, tokenizer, model)
        if emb is not None:
            embeddings[acc] = emb
        else:
            print(f"Failed to embed sequence {acc}. Skipping.")

    print(f"Embedded {len(embeddings)} sequences successfully.")

    if not embeddings:
        # predict_proba cannot be called on an empty feature matrix.
        print("No sequences could be embedded; nothing to predict.")
        return

    sequence_ids = list(embeddings.keys())
    X = np.array(list(embeddings.values()))  # type: ignore
    print("Making predictions...")
    y_pred_proba = predictor.predict_proba(X)

    # Use the classifier's own labels when it exposes them, otherwise generic names.
    if hasattr(predictor, 'classes_'):
        class_names = predictor.classes_.tolist()
    else:
        n_classes = y_pred_proba.shape[1]
        class_names = [f"Class_{i}" for i in range(n_classes)]

    # Convert class names to strings if they aren't already
    class_names = [str(cls) for cls in class_names]

    # Per sequence: classes and probabilities ordered from most to least probable.
    predictions_dict = {}
    for seq_id, row in zip(sequence_ids, y_pred_proba):
        ranked = sorted(zip(class_names, row.tolist()), key=lambda pair: pair[1], reverse=True)
        sorted_classes, sorted_probs = zip(*ranked)
        predictions_dict[seq_id] = (list(sorted_classes), list(sorted_probs))

    # Generate output filename
    input_filename = os.path.splitext(os.path.basename(fasta_path))[0]
    output_file = os.path.join(output_dir, f"{input_filename}_predictions.txt")

    # Save predictions to file
    print(f"Saving predictions to {output_file}...")
    save_predictions_to_txt(predictions_dict, output_file)

    print("Predictions saved successfully!")
    print(f"Total sequences processed: {len(embeddings)}")
    print(f"Output file: {output_file}")

    # Print a few sample predictions
    print("\nSample predictions:")
    for seq_id, (classes, probs) in list(predictions_dict.items())[:3]:
        pred_str = ", ".join([f"{cls} ({prob:.4f})" for cls, prob in zip(classes, probs)])
        print(f"{seq_id}: {pred_str}")
|