ynuozhang
commited on
Commit
·
a98d518
1
Parent(s):
cdf1251
remove unnecessary
Browse files- .gitattributes +8 -1
- README.md +449 -3
- functions/binding.py +0 -186
- functions/hemolysis.py +0 -69
- functions/nonfouling.py +0 -69
- functions/permeability.py +0 -167
- functions/solubility.py +0 -68
- functions/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0
- functions/tokenizer/my_tokenizers.py +0 -398
- functions/tokenizer/new_splits.txt +0 -159
- functions/tokenizer/new_vocab.txt +0 -586
- load.py → inference.py +85 -33
- models/best_model_half_life.pth +0 -3
- models/best_model_hemolysis.json +0 -3
- models/best_model_nonfouling.json +0 -3
- models/best_model_solubility.json +0 -3
- models/binding_affinity_smiles.pt +0 -3
- models/binding_affinity_unpooled.pt +0 -3
- models/enhancer_class.ckpt +0 -3
- models/enhancer_class_hparams.yaml +0 -3
- models/hemolysis-xgboost_smiles.json +0 -3
- models/nonfouling-xgboost_smiles.json +0 -3
- models/permeability-xgboost_smiles.json +0 -3
- models/solubility-xgboost_smiles.json +0 -3
- scoring_functions.py +0 -103
.gitattributes
CHANGED
|
@@ -85,7 +85,14 @@ metrics/solubility/train_classification_plot.png filter=lfs diff=lfs merge=lfs -
|
|
| 85 |
metrics filter=lfs diff=lfs merge=lfs -text
|
| 86 |
models filter=lfs diff=lfs merge=lfs -text
|
| 87 |
training_data filter=lfs diff=lfs merge=lfs -text
|
| 88 |
-
README.md filter=lfs diff=lfs merge=lfs -text
|
| 89 |
embeddings filter=lfs diff=lfs merge=lfs -text
|
| 90 |
models/binding_affinity_for_smiles.pt filter=lfs diff=lfs merge=lfs -text
|
| 91 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
metrics filter=lfs diff=lfs merge=lfs -text
|
| 86 |
models filter=lfs diff=lfs merge=lfs -text
|
| 87 |
training_data filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 88 |
embeddings filter=lfs diff=lfs merge=lfs -text
|
| 89 |
models/binding_affinity_for_smiles.pt filter=lfs diff=lfs merge=lfs -text
|
| 90 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
training_classifiers/half_life/xgb_wt_log/best_model.json filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
training_classifiers/half_life/xgb_wt_raw/best_model.json filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
training_data_cleaned/toxicity/*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings/*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings/train/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings_unpooled/train/data-00000-of-00005.arrow filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings_unpooled/val/*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings/val/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,449 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-nd-4.0
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+

|
| 6 |
+
|
| 7 |
+
# PeptiVerse 🧬🌌
|
| 8 |
+
|
| 9 |
+
A collection of machine learning predictors for canonical and non-canonical peptide property prediction using sequence and SMILES representations. 🧬 PeptiVerse 🌌 enables evaluation of key biophysical and therapeutic properties of peptides for property-optimized generation.
|
| 10 |
+
|
| 11 |
+
## Table of Contents
|
| 12 |
+
|
| 13 |
+
- [Quick start](#quick-start-)
|
| 14 |
+
- [Installation](#installation-)
|
| 15 |
+
- [Repository Structure](#repository-structure-)
|
| 16 |
+
- [Training data collection](#training-data-collection-)
|
| 17 |
+
- [Best model list](#best-model-list-)
|
| 18 |
+
- [Full model set (cuML-enabled)](#full-model-set-cuml-enabled)
|
| 19 |
+
- [Minimal deployable model set (no cuML)](#minimal-deployable-model-set-no-cuml)
|
| 20 |
+
- [Usage](#usage-)
|
| 21 |
+
- [Local Application Hosting](#local-application-hosting)
|
| 22 |
+
- [Dataset integration](#dataset-integration)
|
| 23 |
+
- [Quick inference by property per model](#quick-inference-by-property-per-model)
|
| 24 |
+
- [Property Interpretations](#property-interpretations-)
|
| 25 |
+
- [Model Architecture](#model-architecture-)
|
| 26 |
+
- [Troubleshooting](#troubleshooting-)
|
| 27 |
+
- [Citation](#citation-)
|
| 28 |
+
|
| 29 |
+
## Quick start 🌟
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
# Clone repository
|
| 33 |
+
git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
| 34 |
+
|
| 35 |
+
# Install dependencies
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
|
| 38 |
+
# Run inference
|
| 39 |
+
python inference.py
|
| 40 |
+
```
|
| 41 |
+
## Installation 🌟
|
| 42 |
+
### Minimal Setup 🚀
|
| 43 |
+
- Easy start-up environment (using transformers, xgboost models)
|
| 44 |
+
```bash
|
| 45 |
+
pip install -r requirements.txt
|
| 46 |
+
```
|
| 47 |
+
### Full Setup 🚀
|
| 48 |
+
- Additional access to trained SVM and ElasticNet models requires installation of `RAPIDS cuML`, with instructions available from their official [github page](https://github.com/rapidsai/cuml) (**CUDA-capable GPU required**).
|
| 49 |
+
- Optional: pre-compiled Singularity/Apptainer environment (7.52G) is available at [Google drive](https://drive.google.com/file/d/1RJQ9HK0_gsPOhRo5H5ZmH_MYcpJqQD7e/view?usp=sharing) with everything you need (still need CUDA/GPU to load cuML models).
|
| 50 |
+
```
|
| 51 |
+
# test
|
| 52 |
+
apptainer exec peptiverse.sif python -c "import sys; print(sys.executable)"
|
| 53 |
+
|
| 54 |
+
# run inference (see below)
|
| 55 |
+
apptainer exec peptiverse.sif python inference.py
|
| 56 |
+
```
|
| 57 |
+
## Repository structure 🌟
|
| 58 |
+
This repo contains important large files for [PeptiVerse](https://huggingface.co/spaces/ChatterjeeLab/PeptiVerse), an interactive app for peptide property prediction. [Paper link.](https://www.biorxiv.org/content/10.64898/2025.12.31.697180v1)
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
PeptiVerse/
|
| 62 |
+
├── training_data_cleaned/ # Processed datasets with embeddings
|
| 63 |
+
│ └── <property>/ # Property-specific data
|
| 64 |
+
│ ├── train/val splits
|
| 65 |
+
│ └── precomputed embeddings
|
| 66 |
+
├── training_classifiers/ # Trained model weights
|
| 67 |
+
│ └── <property>/
|
| 68 |
+
│ ├── cnn_wt/ # CNN architectures
|
| 69 |
+
│ ├── mlp_wt/ # MLP architectures
|
| 70 |
+
│ └── xgb_wt/ # XGBoost models
|
| 71 |
+
├── tokenizer/ # PeptideCLM tokenizer
|
| 72 |
+
├── training_data/ # Raw training data
|
| 73 |
+
├── inference.py # Main prediction interface
|
| 74 |
+
├── best_models.txt # Model selection manifest
|
| 75 |
+
└── requirements.txt # Python dependencies
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
## Training Data Collection 🌟
|
| 79 |
+
|
| 80 |
+
<table>
|
| 81 |
+
<caption><strong>Data distribution.</strong> Classification tasks report counts for class 0/1; regression tasks report total sample size (N).</caption>
|
| 82 |
+
<thead>
|
| 83 |
+
<tr>
|
| 84 |
+
<th rowspan="2"><strong>Properties</strong></th>
|
| 85 |
+
<th colspan="2"><strong>Amino Acid Sequences</strong></th>
|
| 86 |
+
<th colspan="2"><strong>SMILES Sequences</strong></th>
|
| 87 |
+
</tr>
|
| 88 |
+
<tr>
|
| 89 |
+
<th><strong>0</strong></th>
|
| 90 |
+
<th><strong>1</strong></th>
|
| 91 |
+
<th><strong>0</strong></th>
|
| 92 |
+
<th><strong>1</strong></th>
|
| 93 |
+
</tr>
|
| 94 |
+
</thead>
|
| 95 |
+
<tbody>
|
| 96 |
+
<tr>
|
| 97 |
+
<td colspan="5"><strong>Classification</strong></td>
|
| 98 |
+
</tr>
|
| 99 |
+
<tr>
|
| 100 |
+
<td>Hemolysis</td>
|
| 101 |
+
<td>4765</td>
|
| 102 |
+
<td>1311</td>
|
| 103 |
+
<td>4765</td>
|
| 104 |
+
<td>1311</td>
|
| 105 |
+
</tr>
|
| 106 |
+
<tr>
|
| 107 |
+
<td>Non-Fouling</td>
|
| 108 |
+
<td>13580</td>
|
| 109 |
+
<td>3600</td>
|
| 110 |
+
<td>13580</td>
|
| 111 |
+
<td>3600</td>
|
| 112 |
+
</tr>
|
| 113 |
+
<tr>
|
| 114 |
+
<td>Solubility</td>
|
| 115 |
+
<td>9668</td>
|
| 116 |
+
<td>8785</td>
|
| 117 |
+
<td>-</td>
|
| 118 |
+
<td>-</td>
|
| 119 |
+
</tr>
|
| 120 |
+
<tr>
|
| 121 |
+
<td>Permeability (Penetrance)</td>
|
| 122 |
+
<td>1162</td>
|
| 123 |
+
<td>1162</td>
|
| 124 |
+
<td>-</td>
|
| 125 |
+
<td>-</td>
|
| 126 |
+
</tr>
|
| 127 |
+
<tr>
|
| 128 |
+
<td>Toxicity</td>
|
| 129 |
+
<td>-</td>
|
| 130 |
+
<td>-</td>
|
| 131 |
+
<td>5518</td>
|
| 132 |
+
<td>5518</td>
|
| 133 |
+
</tr>
|
| 134 |
+
<tr>
|
| 135 |
+
<td colspan="5"><strong>Regression (N)</strong></td>
|
| 136 |
+
</tr>
|
| 137 |
+
<tr>
|
| 138 |
+
<td>Permeability (PAMPA)</td>
|
| 139 |
+
<td colspan="2" align="center">-</td>
|
| 140 |
+
<td colspan="2" align="center">6869</td>
|
| 141 |
+
</tr>
|
| 142 |
+
<tr>
|
| 143 |
+
<td>Permeability (CACO2)</td>
|
| 144 |
+
<td colspan="2" align="center">-</td>
|
| 145 |
+
<td colspan="2" align="center">606</td>
|
| 146 |
+
</tr>
|
| 147 |
+
<tr>
|
| 148 |
+
<td>Half-Life</td>
|
| 149 |
+
<td colspan="2" align="center">130</td>
|
| 150 |
+
<td colspan="2" align="center">245</td>
|
| 151 |
+
</tr>
|
| 152 |
+
<tr>
|
| 153 |
+
<td>Binding Affinity</td>
|
| 154 |
+
<td colspan="2" align="center">1436</td>
|
| 155 |
+
<td colspan="2" align="center">1597</td>
|
| 156 |
+
</tr>
|
| 157 |
+
</tbody>
|
| 158 |
+
</table>
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
## Best Model List 🌟
|
| 162 |
+
|
| 163 |
+
### Full model set (cuML-enabled)
|
| 164 |
+
| Property | Best Model (Sequence) | Best Model (SMILES) | Task Type | Threshold (Sequence) | Threshold (SMILES) |
|
| 165 |
+
|----------------------------|-----------------|---------------------|-------------|----------------|--------------------|
|
| 166 |
+
| Hemolysis | SVM | Transformer | Classifier | 0.2521 | 0.4343 |
|
| 167 |
+
| Non-Fouling | MLP | ENET | Classifier | 0.57 | 0.6969 |
|
| 168 |
+
| Solubility | CNN | – | Classifier | 0.377 | – |
|
| 169 |
+
| Permeability (Penetrance) | SVM | – | Classifier | 0.5493 | – |
|
| 170 |
+
| Toxicity | – | Transformer | Classifier | – | 0.3401 |
|
| 171 |
+
| Binding Affinity | unpooled | unpooled | Regression | – | – |
|
| 172 |
+
| Permeability (PAMPA) | – | CNN | Regression | – | – |
|
| 173 |
+
| Permeability (Caco-2) | – | SVR | Regression | – | – |
|
| 174 |
+
| Half-life | Transformer | XGB | Regression | – | – |
|
| 175 |
+
>Note: *unpooled* indicates models operating on token-level embeddings with cross-attention, rather than mean-pooled representations.
|
| 176 |
+
|
| 177 |
+
### Minimal deployable model set (no cuML)
|
| 178 |
+
| Property | Best Model (WT) | Best Model (SMILES) | Task Type | Threshold (WT) | Threshold (SMILES) |
|
| 179 |
+
|----------------------------|-----------------|---------------------|-------------|----------------|--------------------|
|
| 180 |
+
| Hemolysis | XGB | Transformer | Classifier | 0.2801 | 0.4343 |
|
| 181 |
+
| Non-Fouling | MLP | XGB | Classifier | 0.57 | 0.3982 |
|
| 182 |
+
| Solubility | CNN | – | Classifier | 0.377 | – |
|
| 183 |
+
| Permeability (Penetrance) | XGB | – | Classifier | 0.4301 | – |
|
| 184 |
+
| Toxicity | – | Transformer | Classifier | – | 0.3401 |
|
| 185 |
+
| Binding Affinity | unpooled | unpooled | Regression | – | – |
|
| 186 |
+
| Permeability (PAMPA) | – | CNN | Regression | – | – |
|
| 187 |
+
| Permeability (Caco-2) | – | SVR | Regression | – | – |
|
| 188 |
+
| Half-life | xgb_wt_log | xgb_smiles | Regression | – | – |
|
| 189 |
+
|
| 190 |
+
>Note: Models marked as SVM or ENET are replaced with XGB, as these models are not currently supported in the deployment environment without a cuML setup. *xgb_wt_log* indicates that a log-scaled transformation of time was applied during training.
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
## Usage 🌟
|
| 194 |
+
|
| 195 |
+
### Local Application Hosting
|
| 196 |
+
- Host the [PeptiVerse UI](https://huggingface.co/spaces/ChatterjeeLab/PeptiVerse) locally with your own resources.
|
| 197 |
+
```bash
|
| 198 |
+
# Configure models in best_models.txt
|
| 199 |
+
|
| 200 |
+
git clone https://huggingface.co/spaces/ChatterjeeLab/PeptiVerse
|
| 201 |
+
python app.py
|
| 202 |
+
```
|
| 203 |
+
### Dataset integration
|
| 204 |
+
- All properties are provided with raw_data/split_ready_csvs/[huggingface_datasets](https://huggingface.co/docs/datasets/en/index).
|
| 205 |
+
- Selectively download only the data you need with `huggingface-cli`
|
| 206 |
+
```bash
|
| 207 |
+
huggingface-cli download ChatterjeeLab/PeptiVerse \
|
| 208 |
+
--include "training_data_cleaned/**" \ # only this folder
|
| 209 |
+
--exclude "**/*.pt" "**/*.joblib" \ # skip weights/artifacts
|
| 210 |
+
--local-dir PeptiVerse_partial \
|
| 211 |
+
--local-dir-use-symlinks False # make real copies
|
| 212 |
+
```
|
| 213 |
+
- Or in python
|
| 214 |
+
```python
|
| 215 |
+
from huggingface_hub import snapshot_download
|
| 216 |
+
|
| 217 |
+
local_dir = snapshot_download(
|
| 218 |
+
repo_id="ChatterjeeLab/PeptiVerse",
|
| 219 |
+
allow_patterns=["training_data_cleaned/**"], # only this folder
|
| 220 |
+
ignore_patterns=["**/*.pt", "**/*.joblib"], # skip weights/artifacts
|
| 221 |
+
local_dir="PeptiVerse_partial",
|
| 222 |
+
local_dir_use_symlinks=False, # make real copies
|
| 223 |
+
)
|
| 224 |
+
print("Downloaded to:", local_dir)
|
| 225 |
+
```
|
| 226 |
+
- Usage of the huggingface datasets (with pre-computed embeddings and splits)
|
| 227 |
+
- All embedding datasets are saved via `DatasetDict.save_to_disk` and loadable with:
|
| 228 |
+
``` python
|
| 229 |
+
from datasets import load_from_disk
|
| 230 |
+
ds = load_from_disk(PATH)
|
| 231 |
+
train_ds = ds["train"]
|
| 232 |
+
val_ds = ds["val"]
|
| 233 |
+
```
|
| 234 |
+
- A) Sequence Based ([ESM-2](https://huggingface.co/facebook/esm2_t33_650M_UR50D) embeddings)
|
| 235 |
+
- Pooled (fixed-length vector per sequence)
|
| 236 |
+
- Generated by mean-pooling token embeddings excluding special tokens (CLS/EOS) and padding.
|
| 237 |
+
- Each item:
|
| 238 |
+
sequence: `str`
|
| 239 |
+
label: `int` (classification) or `float` (regression)
|
| 240 |
+
embedding: `float32[H]` (H=1280 for ESM-2 650M)
|
| 241 |
+
- Unpooled (variable-length token matrix)
|
| 242 |
+
- Generated by keeping all valid token embeddings (excluding special tokens + padding) as a per-sequence matrix.
|
| 243 |
+
- Each item:
|
| 244 |
+
sequence: `str`
|
| 245 |
+
label: `int` (classification) or `float` (regression)
|
| 246 |
+
embedding: `float16[L, H]` (nested lists)
|
| 247 |
+
attention_mask: `int8[L]`
|
| 248 |
+
length: `int` (=L)
|
| 249 |
+
- B) SMILES-based ([PeptideCLM](https://github.com/AaronFeller/PeptideCLM) embeddings)
|
| 250 |
+
- Pooled (fixed-length vector per sequence)
|
| 251 |
+
- Generated by mean-pooling token embeddings excluding special tokens (CLS/EOS) and padding.
|
| 252 |
+
- Each item:
|
| 253 |
+
sequence: `str` (SMILES)
|
| 254 |
+
label: `int` (classification) or `float` (regression)
|
| 255 |
+
embedding: `float32[H]`
|
| 256 |
+
- Unpooled (variable-length token matrix)
|
| 257 |
+
- Generated by keeping all valid token embeddings (excluding special tokens + padding) as a per-sequence matrix.
|
| 258 |
+
- Each item:
|
| 259 |
+
sequence: `str` (SMILES)
|
| 260 |
+
label: `int` (classification) or `float` (regression)
|
| 261 |
+
embedding: `float16[L, H]` (nested lists)
|
| 262 |
+
attention_mask: `int8[L]`
|
| 263 |
+
length: `int` (=L)
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
### Quick inference by property per model
|
| 267 |
+
```python
|
| 268 |
+
from inference import PeptiVersePredictor
|
| 269 |
+
|
| 270 |
+
pred = PeptiVersePredictor(
|
| 271 |
+
manifest_path="best_models.txt", # best model list
|
| 272 |
+
classifier_weight_root=".", # repo root (where training_classifiers/ lives)
|
| 273 |
+
device="cuda", # or "cpu"
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
# mode: smiles (SMILES-based models) / wt (Sequence-based models)
|
| 277 |
+
# property keys (with some level of name normalization)
|
| 278 |
+
# hemolysis
|
| 279 |
+
# nf (Non-Fouling)
|
| 280 |
+
# solubility
|
| 281 |
+
# permeability_penetrance
|
| 282 |
+
# toxicity
|
| 283 |
+
# permeability_pampa
|
| 284 |
+
# permeability_caco2
|
| 285 |
+
# halflife
|
| 286 |
+
# binding_affinity
|
| 287 |
+
|
| 288 |
+
seq = "GIVEQCCTSICSLYQLENYCN"
|
| 289 |
+
smiles = "CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@@H](C)N(C)C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]2CCCN2C1=O"
|
| 290 |
+
|
| 291 |
+
# Hemolysis
|
| 292 |
+
out = pred.predict_property("hemolysis", mode="wt", input_str=seq)
|
| 293 |
+
print(out)
|
| 294 |
+
# {"property":"hemolysis","mode":"wt","score":prob,"label":0/1,"threshold":...}
|
| 295 |
+
|
| 296 |
+
out = pred.predict_property("hemolysis", mode="smiles", input_str=smiles)
|
| 297 |
+
print(out)
|
| 298 |
+
|
| 299 |
+
# Non-fouling (key is nf)
|
| 300 |
+
out = pred.predict_property("nf", mode="wt", input_str=seq)
|
| 301 |
+
print(out)
|
| 302 |
+
|
| 303 |
+
out = pred.predict_property("nf", mode="smiles", input_str=smiles)
|
| 304 |
+
print(out)
|
| 305 |
+
|
| 306 |
+
# Solubility (Sequence-only)
|
| 307 |
+
out = pred.predict_property("solubility", mode="wt", input_str=seq)
|
| 308 |
+
print(out)
|
| 309 |
+
|
| 310 |
+
# Permeability (Penetrance) (Sequence-only)
|
| 311 |
+
out = pred.predict_property("permeability_penetrance", mode="wt", input_str=seq)
|
| 312 |
+
print(out)
|
| 313 |
+
|
| 314 |
+
# Toxicity (SMILES-only)
|
| 315 |
+
out = pred.predict_property("toxicity", mode="smiles", input_str=smiles)
|
| 316 |
+
print(out)
|
| 317 |
+
|
| 318 |
+
# Permeability (PAMPA) (SMILES regression)
|
| 319 |
+
out = pred.predict_property("permeability_pampa", mode="smiles", input_str=smiles)
|
| 320 |
+
print(out)
|
| 321 |
+
# {"property":"permeability_pampa","mode":"smiles","score":value}
|
| 322 |
+
|
| 323 |
+
# Permeability (Caco-2) (SMILES regression)
|
| 324 |
+
out = pred.predict_property("permeability_caco2", mode="smiles", input_str=smiles)
|
| 325 |
+
print(out)
|
| 326 |
+
|
| 327 |
+
# Half-life (sequence-based + SMILES regression)
|
| 328 |
+
out = pred.predict_property("halflife", mode="wt", input_str=seq)
|
| 329 |
+
print(out)
|
| 330 |
+
|
| 331 |
+
out = pred.predict_property("halflife", mode="smiles", input_str=smiles)
|
| 332 |
+
print(out)
|
| 333 |
+
|
| 334 |
+
# Binding Affinity
|
| 335 |
+
protein = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQV..." # target protein
|
| 336 |
+
peptide_seq = "GIVEQCCTSICSLYQLENYCN"
|
| 337 |
+
|
| 338 |
+
out = pred.predict_binding_affinity(
|
| 339 |
+
mode="wt",
|
| 340 |
+
target_seq=protein,
|
| 341 |
+
binder_str=peptide_seq,
|
| 342 |
+
)
|
| 343 |
+
print(out)
|
| 344 |
+
# {
|
| 345 |
+
# "property":"binding_affinity",
|
| 346 |
+
# "mode":"wt",
|
| 347 |
+
# "affinity": float,
|
| 348 |
+
# "class_by_threshold": "High (≥9)" / "Moderate (7-9)" / "Low (<7)",
|
| 349 |
+
# "class_by_logits": same buckets,
|
| 350 |
+
# "binding_model": "pooled" or "unpooled",
|
| 351 |
+
# }
|
| 352 |
+
|
| 353 |
+
```
|
| 354 |
+
|
| 355 |
+
## Interpretation 🌟
|
| 356 |
+
|
| 357 |
+
You can also find the same description in the paper or in the PeptiVerse app `Documentation` tab.
|
| 358 |
+
|
| 359 |
+
---
|
| 360 |
+
#### 🩸 Hemolysis Prediction
|
| 361 |
+
Hemolysis is quantified by the concentration at which 50% of red blood cells are lysed (HC50). If HC50 < 100 µM, a peptide is considered hemolytic, otherwise non-hemolytic, resulting in a binary 0/1 dataset. The predicted probability should therefore be interpreted as a risk indicator, not an exact concentration estimate.
|
| 362 |
+
**Output interpretation:**
|
| 363 |
+
- Score close to 1.0 = high probability of red blood cell membrane disruption
|
| 364 |
+
- Score close to 0.0 = non-hemolytic
|
| 365 |
+
---
|
| 366 |
+
|
| 367 |
+
#### 💧 Solubility Prediction
|
| 368 |
+
Outputs a probability (0–1) that a peptide remains soluble in aqueous conditions.
|
| 369 |
+
**Output interpretation:**
|
| 370 |
+
- Score close to 1.0 = highly soluble
|
| 371 |
+
- Score close to 0.0 = poorly soluble
|
| 372 |
+
|
| 373 |
+
---
|
| 374 |
+
|
| 375 |
+
#### 👯 Non-Fouling Prediction
|
| 376 |
+
Higher scores indicate stronger non-fouling behavior, desirable for circulation and surface-exposed applications.
|
| 377 |
+
**Output interpretation:**
|
| 378 |
+
- Score close to 1.0 = non-fouling
|
| 379 |
+
- Score close to 0.0 = fouling
|
| 380 |
+
---
|
| 381 |
+
|
| 382 |
+
#### 🪣 Permeability Prediction
|
| 383 |
+
Predicts membrane permeability on a log P scale.
|
| 384 |
+
**Output interpretation:**
|
| 385 |
+
- Higher values = more permeable (>-6.0)
|
| 386 |
+
- For penetrance predictions, it is a classification prediction, so within the [0, 1] range, closer to 1 indicates more permeable.
|
| 387 |
+
---
|
| 388 |
+
|
| 389 |
+
#### ⏱️ Half-Life Prediction
|
| 390 |
+
**Interpretation:** Predicted values reflect relative peptide stability for the unit in hours. Higher scores indicate longer persistence in serum, while lower scores suggest faster degradation.
|
| 391 |
+
|
| 392 |
+
---
|
| 393 |
+
|
| 394 |
+
#### ☠️ Toxicity Prediction
|
| 395 |
+
**Interpretation:** Outputs a probability (0–1) that a peptide exhibits toxic effects. Higher scores indicate increased toxicity risk.
|
| 396 |
+
|
| 397 |
+
---
|
| 398 |
+
|
| 399 |
+
#### 🔗 Binding Affinity Prediction
|
| 400 |
+
|
| 401 |
+
Predicts peptide-protein binding affinity. Requires both peptide and target protein sequence.
|
| 402 |
+
|
| 403 |
+
**Interpretation:**<br>
|
| 404 |
+
- Scores ≥ 9 correspond to tight binders (K ≤ 10⁻⁹ M, nanomolar to picomolar range)<br>
|
| 405 |
+
- Scores between 7 and 9 correspond to medium binders (10⁻⁷–10⁻⁹ M, nanomolar to micromolar range)<br>
|
| 406 |
+
- Scores < 7 correspond to weak binders (K ≥ 10⁻⁶ M, micromolar and weaker)<br>
|
| 407 |
+
- A difference of 1 unit in score corresponds to an approximately tenfold change in binding affinity.<br>
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
## Model Architecture 🌟
|
| 411 |
+
|
| 412 |
+
- **Sequence Embeddings:** [ESM-2 650M model](https://huggingface.co/facebook/esm2_t33_650M_UR50D) / [PeptideCLM model](https://huggingface.co/aaronfeller/PeptideCLM-23M-all). Foundational embeddings are frozen.
|
| 413 |
+
- **XGBoost Model:** Gradient boosting on pooled embedding features for efficient, high-performance prediction.
|
| 414 |
+
- **CNN/Transformer Model:** One-dimensional convolutional/self-attention transformer networks operating on unpooled embeddings to capture local sequence patterns.
|
| 415 |
+
- **Binding Model:** Transformer-based architecture with cross-attention between protein and peptide representations.
|
| 416 |
+
- **SVR Model:** Support Vector Regression applied to pooled embeddings, providing a kernel-based, nonparametric regression baseline that is robust on smaller or noisy datasets.
|
| 417 |
+
- **Others:** SVM and Elastic Nets were trained with [RAPIDS cuML](https://github.com/rapidsai/cuml), which requires a CUDA environment and is therefore not supported in the web app. Model checkpoints remain available in the Hugging Face repository.
|
| 418 |
+
|
| 419 |
+
## Troubleshooting 🌟
|
| 420 |
+
|
| 421 |
+
### LFS Download Issues
|
| 422 |
+
|
| 423 |
+
If files appear as SHA pointers:
|
| 424 |
+
|
| 425 |
+
```bash
|
| 426 |
+
huggingface-cli download ChatterjeeLab/PeptiVerse \
|
| 427 |
+
training_data_cleaned/hemolysis/hemo_smiles_meta_with_split.csv \
|
| 428 |
+
--local-dir . \
|
| 429 |
+
--local-dir-use-symlinks False
|
| 430 |
+
```
|
| 431 |
+
|
| 432 |
+
## Citation 🌟
|
| 433 |
+
|
| 434 |
+
If you find this repository helpful for your publications, please consider citing our paper:
|
| 435 |
+
|
| 436 |
+
```
|
| 437 |
+
@article {Zhang2025.12.31.697180,
|
| 438 |
+
author = {Zhang, Yinuo and Tang, Sophia and Chen, Tong and Mahood, Elizabeth and Vincoff, Sophia and Chatterjee, Pranam},
|
| 439 |
+
title = {PeptiVerse: A Unified Platform for Therapeutic Peptide Property Prediction},
|
| 440 |
+
elocation-id = {2025.12.31.697180},
|
| 441 |
+
year = {2026},
|
| 442 |
+
doi = {10.64898/2025.12.31.697180},
|
| 443 |
+
publisher = {Cold Spring Harbor Laboratory},
|
| 444 |
+
URL = {https://www.biorxiv.org/content/early/2026/01/03/2025.12.31.697180},
|
| 445 |
+
eprint = {https://www.biorxiv.org/content/early/2026/01/03/2025.12.31.697180.full.pdf},
|
| 446 |
+
journal = {bioRxiv}
|
| 447 |
+
}
|
| 448 |
+
```
|
| 449 |
+
To use this repository, you agree to abide by the CC BY-NC-ND 4.0 License.
|
functions/binding.py
DELETED
|
@@ -1,186 +0,0 @@
|
|
| 1 |
-
|
| 2 |
-
import torch
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import torch.nn as nn
|
| 5 |
-
import esm
|
| 6 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 7 |
-
from transformers import AutoModelForMaskedLM, AutoModelForCausalLM, AutoTokenizer, AutoModel
|
| 8 |
-
|
| 9 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 10 |
-
|
| 11 |
-
class ImprovedBindingPredictor(nn.Module):
    """Cross-attention model predicting peptide–protein binding affinity.

    Consumes a protein embedding (e.g. ESM-2, width ``esm_dim``) and a
    peptide-SMILES embedding (e.g. PeptideCLM, width ``smiles_dim``),
    projects both to a shared width, interleaves bidirectional
    cross-attention blocks, and emits both a scalar regression output
    and 3-way classification logits (tight / medium / weak binding).

    NOTE(review): ``nn.MultiheadAttention`` is used with its default
    ``batch_first=False`` and pooling is over ``dim=0``, so inputs are
    presumably shaped ``(seq_len, hidden)`` or ``(seq_len, batch,
    hidden)`` — confirm against the caller.
    """

    def __init__(self,
                 esm_dim=1280,      # protein embedding width (ESM-2 650M hidden size)
                 smiles_dim=768,    # peptide SMILES embedding width
                 hidden_dim=512,    # shared projection / attention width
                 n_heads=8,         # attention heads per cross-attention block
                 n_layers=3,        # number of cross-attention blocks
                 dropout=0.1):
        super().__init__()

        # Affinity-score thresholds used by get_binding_class to bucket
        # predictions into tight / medium / weak binding.
        self.tight_threshold = 7.5  # Kd/Ki/IC50 ≤ ~30nM
        self.weak_threshold = 6.0   # Kd/Ki/IC50 > 1μM

        # Project both modalities to the same hidden width, then
        # normalize so cross-attention sees comparable scales.
        self.smiles_projection = nn.Linear(smiles_dim, hidden_dim)
        self.protein_projection = nn.Linear(esm_dim, hidden_dim)
        self.protein_norm = nn.LayerNorm(hidden_dim)
        self.smiles_norm = nn.LayerNorm(hidden_dim)

        # Stack of cross-attention blocks. Each block holds one shared
        # MultiheadAttention (reused for both attention directions in
        # forward), a post-attention LayerNorm, a position-wise FFN
        # (4x expansion), and a post-FFN LayerNorm.
        self.cross_attention_layers = nn.ModuleList([
            nn.ModuleDict({
                'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
                'norm1': nn.LayerNorm(hidden_dim),
                'ffn': nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim * 4),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim * 4, hidden_dim)
                ),
                'norm2': nn.LayerNorm(hidden_dim)
            }) for _ in range(n_layers)
        ])

        # Shared trunk over the concatenated (protein ‖ smiles) pooled
        # representations, feeding both prediction heads.
        self.shared_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # Regression head: scalar affinity score.
        self.regression_head = nn.Linear(hidden_dim, 1)

        # Classification head (3 classes: tight, medium, loose binding).
        self.classification_head = nn.Linear(hidden_dim, 3)

    def get_binding_class(self, affinity):
        """Convert affinity values to class indices.

        0: tight binding (>= 7.5)
        1: medium binding (6.0-7.5)
        2: weak binding (< 6.0)

        Accepts either a ``torch.Tensor`` (returns a ``long`` tensor of
        the same shape) or a plain scalar (returns an ``int``).
        """
        if isinstance(affinity, torch.Tensor):
            # Vectorized bucketing: default 0 (tight), then overwrite
            # the medium and weak positions.
            tight_mask = affinity >= self.tight_threshold
            weak_mask = affinity < self.weak_threshold
            medium_mask = ~(tight_mask | weak_mask)

            classes = torch.zeros_like(affinity, dtype=torch.long)
            classes[medium_mask] = 1
            classes[weak_mask] = 2
            return classes
        else:
            if affinity >= self.tight_threshold:
                return 0  # tight binding
            elif affinity < self.weak_threshold:
                return 2  # weak binding
            else:
                return 1  # medium binding

    def forward(self, protein_emb, smiles_emb):
        """Run the cross-attention stack and both prediction heads.

        Args:
            protein_emb: protein token embeddings, last dim ``esm_dim``.
            smiles_emb: peptide SMILES token embeddings, last dim
                ``smiles_dim``.

        Returns:
            Tuple ``(regression_output, classification_logits)`` — a
            scalar affinity prediction (last dim 1) and 3-class logits.
        """
        # Project each modality into the shared hidden space.
        protein = self.protein_norm(self.protein_projection(protein_emb))
        smiles = self.smiles_norm(self.smiles_projection(smiles_emb))

        #protein = protein.transpose(0, 1)
        #smiles = smiles.transpose(0, 1)

        # Alternating bidirectional cross-attention with pre-update
        # residual connections. Note each block reuses one attention
        # module for both directions, and the SMILES→protein pass sees
        # the protein stream already updated within the same block.
        for layer in self.cross_attention_layers:
            # Protein attending to SMILES (query=protein, key/value=smiles).
            attended_protein = layer['attention'](
                protein, smiles, smiles
            )[0]
            protein = layer['norm1'](protein + attended_protein)
            protein = layer['norm2'](protein + layer['ffn'](protein))

            # SMILES attending to protein (query=smiles, key/value=protein).
            attended_smiles = layer['attention'](
                smiles, protein, protein
            )[0]
            smiles = layer['norm1'](smiles + attended_smiles)
            smiles = layer['norm2'](smiles + layer['ffn'](smiles))

        # Mean-pool over dim 0 (the sequence axis — see class NOTE) to
        # get one vector per stream.
        protein_pool = torch.mean(protein, dim=0)
        smiles_pool = torch.mean(smiles, dim=0)

        # Concatenate both pooled representations for the joint trunk.
        combined = torch.cat([protein_pool, smiles_pool], dim=-1)

        # Shared features feed both heads.
        shared_features = self.shared_head(combined)

        regression_output = self.regression_head(shared_features)
        classification_logits = self.classification_head(shared_features)

        return regression_output, classification_logits
|
| 119 |
-
|
| 120 |
-
class BindingAffinity:
    """Predict protein-peptide binding affinity with a cross-attention head.

    The target protein is embedded once with ESM-2 at construction time and
    mean-pooled over residues; each peptide SMILES is embedded with
    PeptideCLM at scoring time, and the pair is fed to
    ``ImprovedBindingPredictor`` whose regression output is the score.
    """

    def __init__(self, prot_seq, model_type='PeptideCLM'):
        # `model_type` is currently unused; kept for interface compatibility.
        # (Removed the original no-op super().__init__() -- this is a plain class.)

        # Peptide (SMILES) encoder.
        self.pep_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.pep_tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                                  f'{base_path}/functions/tokenizer/new_splits.txt')

        # Binding-affinity head.
        # NOTE(review): weights_only=False unpickles arbitrary objects --
        # only load checkpoints from a trusted source.
        self.model = ImprovedBindingPredictor()
        checkpoint = torch.load(f'{base_path}/src/binding/best_model.pt', weights_only=False)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

        # Protein encoder (ESM-2 650M). Fix: switch to eval mode so dropout
        # is disabled during the one-off target embedding below.
        self.esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # load ESM-2 model
        self.esm_model.eval()
        self.prot_tokenizer = alphabet.get_batch_converter()  # load esm tokenizer

        # Embed the target protein once and mean-pool over residues -> (1, dim).
        data = [("target", prot_seq)]
        _, _, prot_tokens = self.prot_tokenizer(data)
        with torch.no_grad():
            results = self.esm_model(prot_tokens, repr_layers=[33])
            prot_emb = results["representations"][33]

        self.prot_emb = torch.mean(prot_emb[0], dim=0, keepdim=True)

    def forward(self, input_seqs):
        """Return one predicted affinity score (float) per peptide SMILES."""
        scores = []
        with torch.no_grad():  # single no_grad scope (was redundantly nested)
            for seq in input_seqs:
                pep_tokens = self.pep_tokenizer(seq, return_tensors='pt', padding=True)
                emb = self.pep_model(input_ids=pep_tokens['input_ids'],
                                     attention_mask=pep_tokens['attention_mask'],
                                     output_hidden_states=True)
                # Mean-pool the peptide token embeddings -> (1, dim).
                pep_emb = torch.mean(emb.last_hidden_state.squeeze(0), dim=0, keepdim=True)
                score, _ = self.model(self.prot_emb, pep_emb)
                scores.append(score.item())
        return scores

    def __call__(self, input_seqs: list):
        return self.forward(input_seqs)
|
| 169 |
-
|
| 170 |
-
def unittest():
    """Smoke test: score one peptide SMILES against the transferrin receptor.

    NOTE(review): the name shadows the stdlib ``unittest`` module; kept for
    backward compatibility with the ``__main__`` guard below.
    """
    # Fix: removed five large unused target sequences (amhr, gfap, glp1,
    # glast, ncam) that were dead locals; only the transferrin receptor
    # sequence was ever used.
    tfr = 'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF'

    binding = BindingAffinity(tfr)

    seq = ["CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)O"]

    scores = binding(seq)
    print(scores)


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/hemolysis.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit.Chem import Descriptors, rdMolDescriptors
|
| 11 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 12 |
-
from rdkit.Chem import AllChem
|
| 13 |
-
from typing import List
|
| 14 |
-
|
| 15 |
-
rdBase.DisableLog('rdApp.error')
|
| 16 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 17 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 18 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 19 |
-
|
| 20 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 21 |
-
|
| 22 |
-
class Hemolysis:
    """Score peptide SMILES for hemolysis.

    An XGBoost classifier over mean-pooled PeptideCLM embeddings; scores are
    returned as P(not hemolytic) = 1 - P(hemolytic), so higher is safer.
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/best_model_f1.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        pooled = []
        for seq in sequences:
            batch = self.tokenizer(seq, return_tensors='pt')
            with torch.no_grad():
                out = self.emb_model(**batch)
            # Mean pooling across the sequence-length dimension.
            pooled.append(out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(pooled)

    def get_scores(self, input_seqs: list):
        """P(not hemolytic) for each input; all-ones fallback if no features."""
        default = np.ones(len(input_seqs))
        features = self.generate_embeddings(input_seqs)

        if len(features) == 0:
            return default

        # Sanitize before handing to XGBoost.
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        hemolytic_prob = self.predictor.predict(xgb.DMatrix(features))
        # Return the probability of it being not hemolytic.
        return default - hemolytic_prob

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 59 |
-
|
| 60 |
-
def unittest():
    """Smoke test: run the hemolysis scorer on a single peptide SMILES."""
    scorer = Hemolysis()
    smiles = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/nonfouling.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 11 |
-
from transformers import AutoModelForMaskedLM
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
rdBase.DisableLog('rdApp.error')
|
| 15 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 16 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 17 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 18 |
-
|
| 19 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 20 |
-
|
| 21 |
-
class Nonfouling:
    """Score peptide SMILES for nonfouling behaviour.

    An XGBoost classifier over mean-pooled PeptideCLM embeddings; returns
    the predicted probability that each peptide is nonfouling.
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/nonfouling/best_model_f1.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        pooled = []
        for seq in sequences:
            batch = self.tokenizer(seq, return_tensors='pt')
            with torch.no_grad():
                out = self.emb_model(**batch)
            # Mean pooling across the sequence-length dimension.
            pooled.append(out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(pooled)

    def get_scores(self, input_seqs: list):
        """Nonfouling probability for each input; zeros fallback if no features."""
        fallback = np.zeros(len(input_seqs))
        features = self.generate_embeddings(input_seqs)

        if len(features) == 0:
            return fallback

        # Sanitize before handing to XGBoost.
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        # Probability of the peptide being nonfouling (the original comment
        # said "not hemolytic" -- a copy-paste from hemolysis.py).
        return self.predictor.predict(xgb.DMatrix(features))

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 59 |
-
|
| 60 |
-
def unittest():
    """Smoke test: run the nonfouling scorer on a single peptide SMILES."""
    scorer = Nonfouling()
    smiles = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/permeability.py
DELETED
|
@@ -1,167 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit.Chem import Descriptors, rdMolDescriptors
|
| 11 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 12 |
-
from rdkit.Chem import AllChem
|
| 13 |
-
from typing import List
|
| 14 |
-
|
| 15 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 16 |
-
|
| 17 |
-
rdBase.DisableLog('rdApp.error')
|
| 18 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 19 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 20 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 21 |
-
|
| 22 |
-
def fingerprints_from_smiles(smiles: List, size=2048):
    """Create ECFP fingerprints of smiles, with validity check.

    Args:
        smiles: list of SMILES strings.
        size: fingerprint bit-vector length.

    Returns:
        ``(fps, valid_mask)`` -- ``fps`` is a (len(smiles), size) array with
        an all-zero row for each unparseable SMILES, and ``valid_mask[i]``
        is 1 iff ``smiles[i]`` parsed.
    """
    if not smiles:
        # Fix: np.concatenate raises ValueError on an empty list; return an
        # empty block with the right width instead of crashing.
        return np.zeros((0, size)), []

    fps = []
    valid_mask = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        valid_mask.append(int(mol is not None))
        fp = fingerprints_from_mol(mol, size=size) if mol else np.zeros((1, size))
        fps.append(fp)

    return np.concatenate(fps, axis=0), valid_mask
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
    """Create an ECFP fingerprint of a molecule as a (1, size) numpy array.

    ``hashed`` selects the hashed count fingerprint instead of the bit
    vector; both are converted to a dense numpy row.
    """
    maker = AllChem.GetHashedMorganFingerprint if hashed else AllChem.GetMorganFingerprintAsBitVect
    fp_bits = maker(molecule, radius, nBits=size)
    dense = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_bits, dense)
    return dense.reshape(1, -1)
|
| 45 |
-
|
| 46 |
-
def getMolDescriptors(mol, missingVal=0):
    """Calculate the full list of RDKit descriptors for a molecule.

    Args:
        mol: an RDKit molecule (or None / invalid -- every descriptor then
            falls back to ``missingVal``).
        missingVal: value recorded when a descriptor function raises.

    Returns:
        ``(values, names)`` in a fixed order: all of ``Descriptors._descList``
        followed by three custom Lipinski/rotatable-bond descriptors.
    """
    custom_descriptors = {'hydrogen-bond donors': rdMolDescriptors.CalcNumLipinskiHBD,
                          'hydrogen-bond acceptors': rdMolDescriptors.CalcNumLipinskiHBA,
                          'rotatable bonds': rdMolDescriptors.CalcNumRotatableBonds,}

    values, names = [], []
    # Single loop over both descriptor sources (the original duplicated the
    # try/append body); order is preserved: _descList first, then custom.
    for nm, fn in list(Descriptors._descList) + list(custom_descriptors.items()):
        try:
            val = fn(mol)
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            val = missingVal
        values.append(val)
        names.append(nm)
    return values, names
|
| 70 |
-
|
| 71 |
-
def get_pep_dps_from_smi(smi):
    """Compute the descriptor vector for one SMILES string.

    Unparseable input is reported on stdout and yields a vector of missing
    values (via ``getMolDescriptors(None)``) rather than raising.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # Fix: was a bare `except:` (also caught KeyboardInterrupt).
        print(f"convert smi {smi} to molecule failed!")
        mol = None

    dps, _ = getMolDescriptors(mol)
    return np.array(dps)
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def get_pep_dps(smi_list):
    """Stack per-SMILES descriptor vectors into a 2-D array.

    An empty input returns a (0, 213) array so downstream concatenation
    keeps a stable width.
    """
    if not smi_list:
        # 213: presumably len(Descriptors._descList) + 3 custom descriptors
        # -- TODO confirm, this varies with the RDKit version.
        return np.zeros((0, 213))
    rows = [get_pep_dps_from_smi(smi) for smi in smi_list]
    return np.array(rows)
|
| 86 |
-
|
| 87 |
-
def check_smi_validity(smiles: list):
    """Filter a SMILES list down to the entries RDKit can parse.

    Returns ``(valid_smi, valid_idx)`` -- the parseable strings and their
    original indices. Empty/falsy entries and parse failures are dropped
    silently (best-effort by design).
    """
    valid_smi = []
    valid_idx = []
    for idx, smi in enumerate(smiles):
        try:
            mol = Chem.MolFromSmiles(smi) if smi else None
        except Exception:
            continue  # unparseable entry: skip without logging
        if mol:
            valid_smi.append(smi)
            valid_idx.append(idx)
    return valid_smi, valid_idx
|
| 99 |
-
|
| 100 |
-
class Permeability:
    """Score peptide SMILES for membrane permeability.

    An XGBoost regressor over PeptideCLM embeddings, optionally augmented
    with ECFP fingerprints and RDKit descriptors (both off by default).
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/permeability/best_model.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        embeddings = []
        for sequence in sequences:
            tokenized = self.tokenizer(sequence, return_tensors='pt')
            with torch.no_grad():
                output = self.emb_model(**tokenized)
            # Mean pooling across sequence length
            embeddings.append(output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(embeddings)

    def get_features(self, input_seqs: list, dps=False, fps=False):
        """Build the feature matrix: [fingerprints | descriptors | embeddings].

        Disabled feature groups contribute an (n, 0) block so the
        concatenation stays shape-stable.
        """
        if fps:
            fingerprints = fingerprints_from_smiles(input_seqs)[0]
        else:
            # Fix: was torch.empty -- keep everything numpy so np.concatenate
            # does not silently convert torch tensors.
            fingerprints = np.empty((len(input_seqs), 0))

        if dps:
            descriptors = get_pep_dps(input_seqs)
        else:
            descriptors = np.empty((len(input_seqs), 0))

        embeddings = self.generate_embeddings(input_seqs)

        return np.concatenate([fingerprints, descriptors, embeddings], axis=1)

    def get_scores(self, input_seqs: list):
        """Predicted permeability per input; -10 sentinel for empty input."""
        scores = -10 * np.ones(len(input_seqs))
        # Fix: an empty input list used to crash in get_features
        # (1-D empty embeddings vs 2-D placeholder blocks); short-circuit.
        if not input_seqs:
            return scores

        features = self.get_features(input_seqs)
        if len(features) == 0:
            return scores

        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        return self.predictor.predict(xgb.DMatrix(features))

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 158 |
-
|
| 159 |
-
def unittest():
    """Smoke test: score one cyclic peptide SMILES for permeability."""
    scorer = Permeability()
    smiles = ['N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1cNc2c1cc(O)cc2)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H]([C@@H](O)C(C)C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O']
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/solubility.py
DELETED
|
@@ -1,68 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit.Chem import Descriptors, rdMolDescriptors
|
| 11 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 12 |
-
from rdkit.Chem import AllChem
|
| 13 |
-
from typing import List
|
| 14 |
-
from transformers import AutoModelForMaskedLM
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
rdBase.DisableLog('rdApp.error')
|
| 18 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 19 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 20 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 21 |
-
|
| 22 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 23 |
-
|
| 24 |
-
class Solubility:
    """Score peptide SMILES for aqueous solubility.

    An XGBoost classifier over mean-pooled PeptideCLM embeddings; returns
    the predicted solubility probability for each input.
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/solubility/best_model_f1.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        pooled = []
        for seq in sequences:
            batch = self.tokenizer(seq, return_tensors='pt')
            with torch.no_grad():
                out = self.emb_model(**batch)
            # Mean pooling across the sequence-length dimension.
            pooled.append(out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(pooled)

    def get_scores(self, input_seqs: list):
        """Solubility probability per input; zeros fallback if no features."""
        fallback = np.zeros(len(input_seqs))
        features = self.generate_embeddings(input_seqs)

        if len(features) == 0:
            return fallback

        # Sanitize before handing to XGBoost.
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        return self.predictor.predict(xgb.DMatrix(features))

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 60 |
-
|
| 61 |
-
def unittest():
    """Smoke test: run the solubility scorer on a single peptide SMILES."""
    scorer = Solubility()
    smiles = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc
DELETED
|
Binary file (15.5 kB)
|
|
|
functions/tokenizer/my_tokenizers.py
DELETED
|
@@ -1,398 +0,0 @@
|
|
| 1 |
-
import collections
|
| 2 |
-
import logging
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
import codecs
|
| 6 |
-
import unicodedata
|
| 7 |
-
from typing import List, Optional
|
| 8 |
-
from transformers import PreTrainedTokenizer
|
| 9 |
-
from SmilesPE.tokenizer import SPE_Tokenizer
|
| 10 |
-
|
| 11 |
-
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index mapping.

    Each line of *vocab_file* is one token; its (0-based) line number
    becomes its id. Insertion order follows file order.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab
|
| 20 |
-
|
| 21 |
-
class Atomwise_Tokenizer(object):
    """Run atom-level SMILES tokenization."""

    def __init__(self):
        """Construct an atom-level tokenizer from a fixed regex.

        Alternatives are tried left to right: short parenthesised groups
        (up to 4 inner characters), bracket atoms, two-letter halogens,
        then single-character organic atoms, bonds, and ring digits.
        """
        self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Split a SMILES string into a list of atom-level tokens."""
        return list(self.regex.findall(text))
|
| 37 |
-
|
| 38 |
-
class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
|
| 39 |
-
r"""
|
| 40 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 41 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 42 |
-
should refer to the superclass for more information regarding methods.
|
| 43 |
-
Args:
|
| 44 |
-
vocab_file (:obj:`string`):
|
| 45 |
-
File containing the vocabulary.
|
| 46 |
-
spe_file (:obj:`string`):
|
| 47 |
-
File containing the trained SMILES Pair Encoding vocabulary.
|
| 48 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 49 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 50 |
-
token instead.
|
| 51 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 52 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 53 |
-
for sequence classification or for a text and a question for question answering.
|
| 54 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 55 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 56 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 57 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 58 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 59 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 60 |
-
special tokens.
|
| 61 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 62 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 63 |
-
modeling. This is the token which the model will try to predict.
|
| 64 |
-
"""
|
| 65 |
-
|
| 66 |
-
def __init__(self, vocab_file, spe_file,
             unk_token="[UNK]",
             sep_token="[SEP]",
             pad_token="[PAD]",
             cls_token="[CLS]",
             mask_token="[MASK]",
             **kwargs):
    """Build the tokenizer from a plain vocab file and an SPE merges file.

    Raises:
        ValueError: if either file does not exist.
    """
    if not os.path.isfile(vocab_file):
        raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
    if not os.path.isfile(spe_file):
        raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))

    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    # Fix: the original kept the spe_file handle open for the object's
    # lifetime (resource leak). SPE_Tokenizer consumes the file in its
    # constructor, so close it once construction is done.
    # NOTE(review): self.spe_vocab is now a *closed* handle, kept only for
    # attribute compatibility -- confirm no caller reads from it afterwards.
    with open(spe_file, 'r', encoding='utf-8') as spe_vocab:
        self.spe_vocab = spe_vocab
        self.spe_tokenizer = SPE_Tokenizer(spe_vocab)

    super().__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs)
|
| 90 |
-
|
| 91 |
-
@property
def vocab_size(self):
    """Size of the base vocabulary (added tokens excluded)."""
    return len(self.vocab)

def get_vocab(self):
    """Full token -> id mapping, including tokens added after init."""
    merged = dict(self.vocab, **self.added_tokens_encoder)
    return merged

def _tokenize(self, text):
    """Split a SMILES string into SPE sub-tokens."""
    spe_string = self.spe_tokenizer.tokenize(text)
    return spe_string.split(' ')

def _convert_token_to_id(self, token):
    """Map a token (str) to its id; unknown tokens map to the [UNK] id."""
    unk_id = self.vocab.get(self.unk_token)
    return self.vocab.get(token, unk_id)

def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
    """Turn a list of ids back into a SMILES-like string."""
    tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
    return self.convert_tokens_to_string(tokens)

def _convert_id_to_token(self, index):
    """Map an id (int) back to its token (str); unknown ids give [UNK]."""
    return self.ids_to_tokens.get(index, self.unk_token)

def convert_tokens_to_string(self, tokens):
    """Join tokens with spaces and fuse WordPiece-style '##' continuations."""
    joined = " ".join(tokens)
    return joined.replace(" ##", "").strip()
|
| 117 |
-
|
| 118 |
-
def build_inputs_with_special_tokens(
|
| 119 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 120 |
-
) -> List[int]:
|
| 121 |
-
"""
|
| 122 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 123 |
-
by concatenating and adding special tokens.
|
| 124 |
-
A BERT sequence has the following format:
|
| 125 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 126 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 127 |
-
Args:
|
| 128 |
-
token_ids_0 (:obj:`List[int]`):
|
| 129 |
-
List of IDs to which the special tokens will be added
|
| 130 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 131 |
-
Optional second list of IDs for sequence pairs.
|
| 132 |
-
Returns:
|
| 133 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 134 |
-
"""
|
| 135 |
-
if token_ids_1 is None:
|
| 136 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 137 |
-
cls = [self.cls_token_id]
|
| 138 |
-
sep = [self.sep_token_id]
|
| 139 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 140 |
-
|
| 141 |
-
def get_special_tokens_mask(
|
| 142 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 143 |
-
) -> List[int]:
|
| 144 |
-
"""
|
| 145 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 146 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 147 |
-
Args:
|
| 148 |
-
token_ids_0 (:obj:`List[int]`):
|
| 149 |
-
List of ids.
|
| 150 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 151 |
-
Optional second list of IDs for sequence pairs.
|
| 152 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 153 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 154 |
-
Returns:
|
| 155 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 156 |
-
"""
|
| 157 |
-
|
| 158 |
-
if already_has_special_tokens:
|
| 159 |
-
if token_ids_1 is not None:
|
| 160 |
-
raise ValueError(
|
| 161 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 162 |
-
"ids is already formated with special tokens for the model."
|
| 163 |
-
)
|
| 164 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 165 |
-
|
| 166 |
-
if token_ids_1 is not None:
|
| 167 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 168 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 169 |
-
|
| 170 |
-
def create_token_type_ids_from_sequences(
|
| 171 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 172 |
-
) -> List[int]:
|
| 173 |
-
"""
|
| 174 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 175 |
-
A BERT sequence pair mask has the following format:
|
| 176 |
-
::
|
| 177 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 178 |
-
| first sequence | second sequence |
|
| 179 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 180 |
-
Args:
|
| 181 |
-
token_ids_0 (:obj:`List[int]`):
|
| 182 |
-
List of ids.
|
| 183 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 184 |
-
Optional second list of IDs for sequence pairs.
|
| 185 |
-
Returns:
|
| 186 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 187 |
-
sequence(s).
|
| 188 |
-
"""
|
| 189 |
-
sep = [self.sep_token_id]
|
| 190 |
-
cls = [self.cls_token_id]
|
| 191 |
-
if token_ids_1 is None:
|
| 192 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 193 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 194 |
-
|
| 195 |
-
def save_vocabulary(self, vocab_path):
|
| 196 |
-
"""
|
| 197 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 198 |
-
Args:
|
| 199 |
-
vocab_path (:obj:`str`):
|
| 200 |
-
The directory in which to save the vocabulary.
|
| 201 |
-
Returns:
|
| 202 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 203 |
-
"""
|
| 204 |
-
index = 0
|
| 205 |
-
if os.path.isdir(vocab_path):
|
| 206 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 207 |
-
else:
|
| 208 |
-
vocab_file = vocab_path
|
| 209 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 210 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 211 |
-
if index != token_index:
|
| 212 |
-
logger.warning(
|
| 213 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 214 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 215 |
-
)
|
| 216 |
-
index = token_index
|
| 217 |
-
writer.write(token + "\n")
|
| 218 |
-
index += 1
|
| 219 |
-
return (vocab_file,)
|
| 220 |
-
|
| 221 |
-
class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
|
| 222 |
-
r"""
|
| 223 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 224 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 225 |
-
should refer to the superclass for more information regarding methods.
|
| 226 |
-
Args:
|
| 227 |
-
vocab_file (:obj:`string`):
|
| 228 |
-
File containing the vocabulary.
|
| 229 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 230 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 231 |
-
token instead.
|
| 232 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 233 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 234 |
-
for sequence classification or for a text and a question for question answering.
|
| 235 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 236 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 237 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 238 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 239 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 240 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 241 |
-
special tokens.
|
| 242 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 243 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 244 |
-
modeling. This is the token which the model will try to predict.
|
| 245 |
-
"""
|
| 246 |
-
|
| 247 |
-
def __init__(
|
| 248 |
-
self,
|
| 249 |
-
vocab_file,
|
| 250 |
-
unk_token="[UNK]",
|
| 251 |
-
sep_token="[SEP]",
|
| 252 |
-
pad_token="[PAD]",
|
| 253 |
-
cls_token="[CLS]",
|
| 254 |
-
mask_token="[MASK]",
|
| 255 |
-
**kwargs
|
| 256 |
-
):
|
| 257 |
-
super().__init__(
|
| 258 |
-
unk_token=unk_token,
|
| 259 |
-
sep_token=sep_token,
|
| 260 |
-
pad_token=pad_token,
|
| 261 |
-
cls_token=cls_token,
|
| 262 |
-
mask_token=mask_token,
|
| 263 |
-
**kwargs,
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
if not os.path.isfile(vocab_file):
|
| 267 |
-
raise ValueError(
|
| 268 |
-
"Can't find a vocabulary file at path '{}'.".format(vocab_file)
|
| 269 |
-
)
|
| 270 |
-
self.vocab = load_vocab(vocab_file)
|
| 271 |
-
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
| 272 |
-
self.tokenizer = Atomwise_Tokenizer()
|
| 273 |
-
|
| 274 |
-
@property
|
| 275 |
-
def vocab_size(self):
|
| 276 |
-
return len(self.vocab)
|
| 277 |
-
|
| 278 |
-
def get_vocab(self):
|
| 279 |
-
return dict(self.vocab, **self.added_tokens_encoder)
|
| 280 |
-
|
| 281 |
-
def _tokenize(self, text):
|
| 282 |
-
return self.tokenizer.tokenize(text)
|
| 283 |
-
|
| 284 |
-
def _convert_token_to_id(self, token):
|
| 285 |
-
""" Converts a token (str) in an id using the vocab. """
|
| 286 |
-
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
| 287 |
-
|
| 288 |
-
def _convert_id_to_token(self, index):
|
| 289 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
| 290 |
-
return self.ids_to_tokens.get(index, self.unk_token)
|
| 291 |
-
|
| 292 |
-
def convert_tokens_to_string(self, tokens):
|
| 293 |
-
""" Converts a sequence of tokens (string) in a single string. """
|
| 294 |
-
out_string = " ".join(tokens).replace(" ##", "").strip()
|
| 295 |
-
return out_string
|
| 296 |
-
|
| 297 |
-
def build_inputs_with_special_tokens(
|
| 298 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 299 |
-
) -> List[int]:
|
| 300 |
-
"""
|
| 301 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 302 |
-
by concatenating and adding special tokens.
|
| 303 |
-
A BERT sequence has the following format:
|
| 304 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 305 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 306 |
-
Args:
|
| 307 |
-
token_ids_0 (:obj:`List[int]`):
|
| 308 |
-
List of IDs to which the special tokens will be added
|
| 309 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 310 |
-
Optional second list of IDs for sequence pairs.
|
| 311 |
-
Returns:
|
| 312 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 313 |
-
"""
|
| 314 |
-
if token_ids_1 is None:
|
| 315 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 316 |
-
cls = [self.cls_token_id]
|
| 317 |
-
sep = [self.sep_token_id]
|
| 318 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 319 |
-
|
| 320 |
-
def get_special_tokens_mask(
|
| 321 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 322 |
-
) -> List[int]:
|
| 323 |
-
"""
|
| 324 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 325 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 326 |
-
Args:
|
| 327 |
-
token_ids_0 (:obj:`List[int]`):
|
| 328 |
-
List of ids.
|
| 329 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 330 |
-
Optional second list of IDs for sequence pairs.
|
| 331 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 332 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 333 |
-
Returns:
|
| 334 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 335 |
-
"""
|
| 336 |
-
|
| 337 |
-
if already_has_special_tokens:
|
| 338 |
-
if token_ids_1 is not None:
|
| 339 |
-
raise ValueError(
|
| 340 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 341 |
-
"ids is already formated with special tokens for the model."
|
| 342 |
-
)
|
| 343 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 344 |
-
|
| 345 |
-
if token_ids_1 is not None:
|
| 346 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 347 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 348 |
-
|
| 349 |
-
def create_token_type_ids_from_sequences(
|
| 350 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 351 |
-
) -> List[int]:
|
| 352 |
-
"""
|
| 353 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 354 |
-
A BERT sequence pair mask has the following format:
|
| 355 |
-
::
|
| 356 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 357 |
-
| first sequence | second sequence |
|
| 358 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 359 |
-
Args:
|
| 360 |
-
token_ids_0 (:obj:`List[int]`):
|
| 361 |
-
List of ids.
|
| 362 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 363 |
-
Optional second list of IDs for sequence pairs.
|
| 364 |
-
Returns:
|
| 365 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 366 |
-
sequence(s).
|
| 367 |
-
"""
|
| 368 |
-
sep = [self.sep_token_id]
|
| 369 |
-
cls = [self.cls_token_id]
|
| 370 |
-
if token_ids_1 is None:
|
| 371 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 372 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 373 |
-
|
| 374 |
-
def save_vocabulary(self, vocab_path):
|
| 375 |
-
"""
|
| 376 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 377 |
-
Args:
|
| 378 |
-
vocab_path (:obj:`str`):
|
| 379 |
-
The directory in which to save the vocabulary.
|
| 380 |
-
Returns:
|
| 381 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 382 |
-
"""
|
| 383 |
-
index = 0
|
| 384 |
-
if os.path.isdir(vocab_path):
|
| 385 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 386 |
-
else:
|
| 387 |
-
vocab_file = vocab_path
|
| 388 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 389 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 390 |
-
if index != token_index:
|
| 391 |
-
logger.warning(
|
| 392 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 393 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 394 |
-
)
|
| 395 |
-
index = token_index
|
| 396 |
-
writer.write(token + "\n")
|
| 397 |
-
index += 1
|
| 398 |
-
return (vocab_file,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/tokenizer/new_splits.txt
DELETED
|
@@ -1,159 +0,0 @@
|
|
| 1 |
-
c 1
|
| 2 |
-
c 2
|
| 3 |
-
c 3
|
| 4 |
-
c 4
|
| 5 |
-
c 5
|
| 6 |
-
c 6
|
| 7 |
-
c 7
|
| 8 |
-
c 8
|
| 9 |
-
c 9
|
| 10 |
-
( c1
|
| 11 |
-
( c2
|
| 12 |
-
c1 )
|
| 13 |
-
c2 )
|
| 14 |
-
n 1
|
| 15 |
-
n 2
|
| 16 |
-
n 3
|
| 17 |
-
n 4
|
| 18 |
-
n 5
|
| 19 |
-
n 6
|
| 20 |
-
n 7
|
| 21 |
-
n 8
|
| 22 |
-
n 9
|
| 23 |
-
( n1
|
| 24 |
-
( n2
|
| 25 |
-
n1 )
|
| 26 |
-
n2 )
|
| 27 |
-
O 1
|
| 28 |
-
O 2
|
| 29 |
-
O 3
|
| 30 |
-
O 4
|
| 31 |
-
O 5
|
| 32 |
-
O 6
|
| 33 |
-
O 7
|
| 34 |
-
O 8
|
| 35 |
-
O 9
|
| 36 |
-
( O1
|
| 37 |
-
( O2
|
| 38 |
-
O2 )
|
| 39 |
-
O2 )
|
| 40 |
-
= O
|
| 41 |
-
= C
|
| 42 |
-
= c
|
| 43 |
-
= N
|
| 44 |
-
= n
|
| 45 |
-
=C C
|
| 46 |
-
=C N
|
| 47 |
-
=C c
|
| 48 |
-
=c c
|
| 49 |
-
=N C
|
| 50 |
-
=N c
|
| 51 |
-
=n C
|
| 52 |
-
=n c
|
| 53 |
-
# N
|
| 54 |
-
# C
|
| 55 |
-
#N C
|
| 56 |
-
#C C
|
| 57 |
-
#C N
|
| 58 |
-
#N N
|
| 59 |
-
( C
|
| 60 |
-
C )
|
| 61 |
-
( O
|
| 62 |
-
O )
|
| 63 |
-
( N
|
| 64 |
-
N )
|
| 65 |
-
Br c
|
| 66 |
-
( =O
|
| 67 |
-
(=O )
|
| 68 |
-
C (=O)
|
| 69 |
-
C =O
|
| 70 |
-
C =N
|
| 71 |
-
C #N
|
| 72 |
-
C #C
|
| 73 |
-
C C
|
| 74 |
-
CC C
|
| 75 |
-
CC N
|
| 76 |
-
CC O
|
| 77 |
-
CC S
|
| 78 |
-
CC c
|
| 79 |
-
CC n
|
| 80 |
-
C N
|
| 81 |
-
CN C
|
| 82 |
-
CN c
|
| 83 |
-
C O
|
| 84 |
-
CO C
|
| 85 |
-
CO N
|
| 86 |
-
CO c
|
| 87 |
-
C S
|
| 88 |
-
CS C
|
| 89 |
-
CS S
|
| 90 |
-
CS c
|
| 91 |
-
C c
|
| 92 |
-
Cl c
|
| 93 |
-
C n
|
| 94 |
-
F c
|
| 95 |
-
N C
|
| 96 |
-
NC C
|
| 97 |
-
NC c
|
| 98 |
-
N N
|
| 99 |
-
N O
|
| 100 |
-
N c
|
| 101 |
-
N n
|
| 102 |
-
O C
|
| 103 |
-
OC C
|
| 104 |
-
OC O
|
| 105 |
-
OC c
|
| 106 |
-
O N
|
| 107 |
-
O O
|
| 108 |
-
O c
|
| 109 |
-
S C
|
| 110 |
-
SC C
|
| 111 |
-
SC c
|
| 112 |
-
S S
|
| 113 |
-
S c
|
| 114 |
-
c c
|
| 115 |
-
cc c
|
| 116 |
-
cc n
|
| 117 |
-
cc o
|
| 118 |
-
cc s
|
| 119 |
-
cc cc
|
| 120 |
-
c n
|
| 121 |
-
cn c
|
| 122 |
-
cn n
|
| 123 |
-
c o
|
| 124 |
-
co c
|
| 125 |
-
c s
|
| 126 |
-
cs c
|
| 127 |
-
cs n
|
| 128 |
-
n c
|
| 129 |
-
nc c
|
| 130 |
-
nc n
|
| 131 |
-
nc o
|
| 132 |
-
nc s
|
| 133 |
-
n n
|
| 134 |
-
nn c
|
| 135 |
-
nn n
|
| 136 |
-
n o
|
| 137 |
-
no c
|
| 138 |
-
no n
|
| 139 |
-
n s
|
| 140 |
-
ns c
|
| 141 |
-
ns n
|
| 142 |
-
o c
|
| 143 |
-
oc c
|
| 144 |
-
o n
|
| 145 |
-
s c
|
| 146 |
-
sc c
|
| 147 |
-
sc n
|
| 148 |
-
s n
|
| 149 |
-
N P
|
| 150 |
-
P N
|
| 151 |
-
C P
|
| 152 |
-
P C
|
| 153 |
-
N S
|
| 154 |
-
S N
|
| 155 |
-
C S
|
| 156 |
-
S C
|
| 157 |
-
S P
|
| 158 |
-
P S
|
| 159 |
-
C I
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/tokenizer/new_vocab.txt
DELETED
|
@@ -1,586 +0,0 @@
|
|
| 1 |
-
[PAD]
|
| 2 |
-
[UNK]
|
| 3 |
-
[CLS]
|
| 4 |
-
[SEP]
|
| 5 |
-
[MASK]
|
| 6 |
-
#
|
| 7 |
-
%
|
| 8 |
-
(
|
| 9 |
-
)
|
| 10 |
-
+
|
| 11 |
-
-
|
| 12 |
-
/
|
| 13 |
-
0
|
| 14 |
-
1
|
| 15 |
-
2
|
| 16 |
-
3
|
| 17 |
-
4
|
| 18 |
-
5
|
| 19 |
-
6
|
| 20 |
-
7
|
| 21 |
-
8
|
| 22 |
-
9
|
| 23 |
-
=
|
| 24 |
-
@
|
| 25 |
-
A
|
| 26 |
-
B
|
| 27 |
-
Br
|
| 28 |
-
Brc
|
| 29 |
-
C
|
| 30 |
-
CC
|
| 31 |
-
CCC
|
| 32 |
-
CCN
|
| 33 |
-
CCO
|
| 34 |
-
CCS
|
| 35 |
-
CCc
|
| 36 |
-
CCn
|
| 37 |
-
CN
|
| 38 |
-
CNC
|
| 39 |
-
CNc
|
| 40 |
-
CO
|
| 41 |
-
COC
|
| 42 |
-
CON
|
| 43 |
-
COc
|
| 44 |
-
CS
|
| 45 |
-
CSC
|
| 46 |
-
CSS
|
| 47 |
-
CSc
|
| 48 |
-
Cc
|
| 49 |
-
Cl
|
| 50 |
-
Clc
|
| 51 |
-
Cn
|
| 52 |
-
F
|
| 53 |
-
Fc
|
| 54 |
-
H
|
| 55 |
-
I
|
| 56 |
-
K
|
| 57 |
-
L
|
| 58 |
-
M
|
| 59 |
-
N
|
| 60 |
-
NC
|
| 61 |
-
NCC
|
| 62 |
-
NCc
|
| 63 |
-
NN
|
| 64 |
-
NO
|
| 65 |
-
Nc
|
| 66 |
-
Nn
|
| 67 |
-
O
|
| 68 |
-
OC
|
| 69 |
-
OCC
|
| 70 |
-
OCO
|
| 71 |
-
OCc
|
| 72 |
-
ON
|
| 73 |
-
OO
|
| 74 |
-
Oc
|
| 75 |
-
P
|
| 76 |
-
R
|
| 77 |
-
S
|
| 78 |
-
SC
|
| 79 |
-
SCC
|
| 80 |
-
SCc
|
| 81 |
-
SS
|
| 82 |
-
Sc
|
| 83 |
-
T
|
| 84 |
-
X
|
| 85 |
-
Z
|
| 86 |
-
[
|
| 87 |
-
\\
|
| 88 |
-
(/
|
| 89 |
-
]
|
| 90 |
-
a
|
| 91 |
-
b
|
| 92 |
-
c
|
| 93 |
-
cc
|
| 94 |
-
ccc
|
| 95 |
-
ccn
|
| 96 |
-
cco
|
| 97 |
-
ccs
|
| 98 |
-
cn
|
| 99 |
-
cnc
|
| 100 |
-
cnn
|
| 101 |
-
co
|
| 102 |
-
coc
|
| 103 |
-
cs
|
| 104 |
-
csc
|
| 105 |
-
csn
|
| 106 |
-
e
|
| 107 |
-
g
|
| 108 |
-
i
|
| 109 |
-
l
|
| 110 |
-
n
|
| 111 |
-
nc
|
| 112 |
-
ncc
|
| 113 |
-
ncn
|
| 114 |
-
nco
|
| 115 |
-
ncs
|
| 116 |
-
nn
|
| 117 |
-
nnc
|
| 118 |
-
nnn
|
| 119 |
-
no
|
| 120 |
-
noc
|
| 121 |
-
non
|
| 122 |
-
ns
|
| 123 |
-
nsc
|
| 124 |
-
nsn
|
| 125 |
-
o
|
| 126 |
-
oc
|
| 127 |
-
occ
|
| 128 |
-
on
|
| 129 |
-
p
|
| 130 |
-
r
|
| 131 |
-
s
|
| 132 |
-
sc
|
| 133 |
-
scc
|
| 134 |
-
scn
|
| 135 |
-
sn
|
| 136 |
-
t
|
| 137 |
-
c1
|
| 138 |
-
c2
|
| 139 |
-
c3
|
| 140 |
-
c4
|
| 141 |
-
c5
|
| 142 |
-
c6
|
| 143 |
-
c7
|
| 144 |
-
c8
|
| 145 |
-
c9
|
| 146 |
-
n1
|
| 147 |
-
n2
|
| 148 |
-
n3
|
| 149 |
-
n4
|
| 150 |
-
n5
|
| 151 |
-
n6
|
| 152 |
-
n7
|
| 153 |
-
n8
|
| 154 |
-
n9
|
| 155 |
-
O1
|
| 156 |
-
O2
|
| 157 |
-
O3
|
| 158 |
-
O4
|
| 159 |
-
O5
|
| 160 |
-
O6
|
| 161 |
-
O7
|
| 162 |
-
O8
|
| 163 |
-
O9
|
| 164 |
-
(c1
|
| 165 |
-
(c2
|
| 166 |
-
c1)
|
| 167 |
-
c2)
|
| 168 |
-
(n1
|
| 169 |
-
(n2
|
| 170 |
-
n1)
|
| 171 |
-
n2)
|
| 172 |
-
(O1
|
| 173 |
-
(O2
|
| 174 |
-
O2)
|
| 175 |
-
=O
|
| 176 |
-
=C
|
| 177 |
-
=c
|
| 178 |
-
=N
|
| 179 |
-
=n
|
| 180 |
-
=CC
|
| 181 |
-
=CN
|
| 182 |
-
=Cc
|
| 183 |
-
=cc
|
| 184 |
-
=NC
|
| 185 |
-
=Nc
|
| 186 |
-
=nC
|
| 187 |
-
=nc
|
| 188 |
-
#C
|
| 189 |
-
#CC
|
| 190 |
-
#CN
|
| 191 |
-
#N
|
| 192 |
-
#NC
|
| 193 |
-
#NN
|
| 194 |
-
(C
|
| 195 |
-
C)
|
| 196 |
-
(O
|
| 197 |
-
O)
|
| 198 |
-
(N
|
| 199 |
-
N)
|
| 200 |
-
NP
|
| 201 |
-
PN
|
| 202 |
-
CP
|
| 203 |
-
PC
|
| 204 |
-
NS
|
| 205 |
-
SN
|
| 206 |
-
SP
|
| 207 |
-
PS
|
| 208 |
-
C(=O)
|
| 209 |
-
(/Br)
|
| 210 |
-
(/C#N)
|
| 211 |
-
(/C)
|
| 212 |
-
(/C=N)
|
| 213 |
-
(/C=O)
|
| 214 |
-
(/CBr)
|
| 215 |
-
(/CC)
|
| 216 |
-
(/CCC)
|
| 217 |
-
(/CCF)
|
| 218 |
-
(/CCN)
|
| 219 |
-
(/CCO)
|
| 220 |
-
(/CCl)
|
| 221 |
-
(/CI)
|
| 222 |
-
(/CN)
|
| 223 |
-
(/CO)
|
| 224 |
-
(/CS)
|
| 225 |
-
(/Cl)
|
| 226 |
-
(/F)
|
| 227 |
-
(/I)
|
| 228 |
-
(/N)
|
| 229 |
-
(/NC)
|
| 230 |
-
(/NCC)
|
| 231 |
-
(/NO)
|
| 232 |
-
(/O)
|
| 233 |
-
(/OC)
|
| 234 |
-
(/OCC)
|
| 235 |
-
(/S)
|
| 236 |
-
(/SC)
|
| 237 |
-
(=C)
|
| 238 |
-
(=C/C)
|
| 239 |
-
(=C/F)
|
| 240 |
-
(=C/I)
|
| 241 |
-
(=C/N)
|
| 242 |
-
(=C/O)
|
| 243 |
-
(=CBr)
|
| 244 |
-
(=CC)
|
| 245 |
-
(=CCF)
|
| 246 |
-
(=CCN)
|
| 247 |
-
(=CCO)
|
| 248 |
-
(=CCl)
|
| 249 |
-
(=CF)
|
| 250 |
-
(=CI)
|
| 251 |
-
(=CN)
|
| 252 |
-
(=CO)
|
| 253 |
-
(=C\\C)
|
| 254 |
-
(=C\\F)
|
| 255 |
-
(=C\\I)
|
| 256 |
-
(=C\\N)
|
| 257 |
-
(=C\\O)
|
| 258 |
-
(=N)
|
| 259 |
-
(=N/C)
|
| 260 |
-
(=N/N)
|
| 261 |
-
(=N/O)
|
| 262 |
-
(=NBr)
|
| 263 |
-
(=NC)
|
| 264 |
-
(=NCC)
|
| 265 |
-
(=NCl)
|
| 266 |
-
(=NN)
|
| 267 |
-
(=NO)
|
| 268 |
-
(=NOC)
|
| 269 |
-
(=N\\C)
|
| 270 |
-
(=N\\N)
|
| 271 |
-
(=N\\O)
|
| 272 |
-
(=O)
|
| 273 |
-
(=S)
|
| 274 |
-
(B)
|
| 275 |
-
(Br)
|
| 276 |
-
(C#C)
|
| 277 |
-
(C#CC)
|
| 278 |
-
(C#CI)
|
| 279 |
-
(C#CO)
|
| 280 |
-
(C#N)
|
| 281 |
-
(C#SN)
|
| 282 |
-
(C)
|
| 283 |
-
(C=C)
|
| 284 |
-
(C=CF)
|
| 285 |
-
(C=CI)
|
| 286 |
-
(C=N)
|
| 287 |
-
(C=NN)
|
| 288 |
-
(C=NO)
|
| 289 |
-
(C=O)
|
| 290 |
-
(C=S)
|
| 291 |
-
(CBr)
|
| 292 |
-
(CC#C)
|
| 293 |
-
(CC#N)
|
| 294 |
-
(CC)
|
| 295 |
-
(CC=C)
|
| 296 |
-
(CC=O)
|
| 297 |
-
(CCBr)
|
| 298 |
-
(CCC)
|
| 299 |
-
(CCCC)
|
| 300 |
-
(CCCF)
|
| 301 |
-
(CCCI)
|
| 302 |
-
(CCCN)
|
| 303 |
-
(CCCO)
|
| 304 |
-
(CCCS)
|
| 305 |
-
(CCCl)
|
| 306 |
-
(CCF)
|
| 307 |
-
(CCI)
|
| 308 |
-
(CCN)
|
| 309 |
-
(CCNC)
|
| 310 |
-
(CCNN)
|
| 311 |
-
(CCNO)
|
| 312 |
-
(CCO)
|
| 313 |
-
(CCOC)
|
| 314 |
-
(CCON)
|
| 315 |
-
(CCS)
|
| 316 |
-
(CCSC)
|
| 317 |
-
(CCl)
|
| 318 |
-
(CF)
|
| 319 |
-
(CI)
|
| 320 |
-
(CN)
|
| 321 |
-
(CN=O)
|
| 322 |
-
(CNC)
|
| 323 |
-
(CNCC)
|
| 324 |
-
(CNCO)
|
| 325 |
-
(CNN)
|
| 326 |
-
(CNNC)
|
| 327 |
-
(CNO)
|
| 328 |
-
(CNOC)
|
| 329 |
-
(CO)
|
| 330 |
-
(COC)
|
| 331 |
-
(COCC)
|
| 332 |
-
(COCI)
|
| 333 |
-
(COCN)
|
| 334 |
-
(COCO)
|
| 335 |
-
(COF)
|
| 336 |
-
(CON)
|
| 337 |
-
(COO)
|
| 338 |
-
(CS)
|
| 339 |
-
(CSC)
|
| 340 |
-
(CSCC)
|
| 341 |
-
(CSCF)
|
| 342 |
-
(CSO)
|
| 343 |
-
(Cl)
|
| 344 |
-
(F)
|
| 345 |
-
(I)
|
| 346 |
-
(N)
|
| 347 |
-
(N=N)
|
| 348 |
-
(N=NO)
|
| 349 |
-
(N=O)
|
| 350 |
-
(N=S)
|
| 351 |
-
(NBr)
|
| 352 |
-
(NC#N)
|
| 353 |
-
(NC)
|
| 354 |
-
(NC=N)
|
| 355 |
-
(NC=O)
|
| 356 |
-
(NC=S)
|
| 357 |
-
(NCBr)
|
| 358 |
-
(NCC)
|
| 359 |
-
(NCCC)
|
| 360 |
-
(NCCF)
|
| 361 |
-
(NCCN)
|
| 362 |
-
(NCCO)
|
| 363 |
-
(NCCS)
|
| 364 |
-
(NCCl)
|
| 365 |
-
(NCNC)
|
| 366 |
-
(NCO)
|
| 367 |
-
(NCS)
|
| 368 |
-
(NCl)
|
| 369 |
-
(NN)
|
| 370 |
-
(NN=O)
|
| 371 |
-
(NNC)
|
| 372 |
-
(NO)
|
| 373 |
-
(NOC)
|
| 374 |
-
(O)
|
| 375 |
-
(OC#N)
|
| 376 |
-
(OC)
|
| 377 |
-
(OC=C)
|
| 378 |
-
(OC=O)
|
| 379 |
-
(OC=S)
|
| 380 |
-
(OCBr)
|
| 381 |
-
(OCC)
|
| 382 |
-
(OCCC)
|
| 383 |
-
(OCCF)
|
| 384 |
-
(OCCI)
|
| 385 |
-
(OCCN)
|
| 386 |
-
(OCCO)
|
| 387 |
-
(OCCS)
|
| 388 |
-
(OCCl)
|
| 389 |
-
(OCF)
|
| 390 |
-
(OCI)
|
| 391 |
-
(OCO)
|
| 392 |
-
(OCOC)
|
| 393 |
-
(OCON)
|
| 394 |
-
(OCSC)
|
| 395 |
-
(OCl)
|
| 396 |
-
(OI)
|
| 397 |
-
(ON)
|
| 398 |
-
(OO)
|
| 399 |
-
(OOC)
|
| 400 |
-
(OOCC)
|
| 401 |
-
(OOSN)
|
| 402 |
-
(OSC)
|
| 403 |
-
(P)
|
| 404 |
-
(S)
|
| 405 |
-
(SC#N)
|
| 406 |
-
(SC)
|
| 407 |
-
(SCC)
|
| 408 |
-
(SCCC)
|
| 409 |
-
(SCCF)
|
| 410 |
-
(SCCN)
|
| 411 |
-
(SCCO)
|
| 412 |
-
(SCCS)
|
| 413 |
-
(SCCl)
|
| 414 |
-
(SCF)
|
| 415 |
-
(SCN)
|
| 416 |
-
(SCOC)
|
| 417 |
-
(SCSC)
|
| 418 |
-
(SCl)
|
| 419 |
-
(SI)
|
| 420 |
-
(SN)
|
| 421 |
-
(SN=O)
|
| 422 |
-
(SO)
|
| 423 |
-
(SOC)
|
| 424 |
-
(SOOO)
|
| 425 |
-
(SS)
|
| 426 |
-
(SSC)
|
| 427 |
-
(SSCC)
|
| 428 |
-
([At])
|
| 429 |
-
([O-])
|
| 430 |
-
([O])
|
| 431 |
-
([S-])
|
| 432 |
-
(\\Br)
|
| 433 |
-
(\\C#N)
|
| 434 |
-
(\\C)
|
| 435 |
-
(\\C=N)
|
| 436 |
-
(\\C=O)
|
| 437 |
-
(\\CBr)
|
| 438 |
-
(\\CC)
|
| 439 |
-
(\\CCC)
|
| 440 |
-
(\\CCO)
|
| 441 |
-
(\\CCl)
|
| 442 |
-
(\\CF)
|
| 443 |
-
(\\CN)
|
| 444 |
-
(\\CNC)
|
| 445 |
-
(\\CO)
|
| 446 |
-
(\\COC)
|
| 447 |
-
(\\Cl)
|
| 448 |
-
(\\F)
|
| 449 |
-
(\\I)
|
| 450 |
-
(\\N)
|
| 451 |
-
(\\NC)
|
| 452 |
-
(\\NCC)
|
| 453 |
-
(\\NN)
|
| 454 |
-
(\\NO)
|
| 455 |
-
(\\NOC)
|
| 456 |
-
(\\O)
|
| 457 |
-
(\\OC)
|
| 458 |
-
(\\OCC)
|
| 459 |
-
(\\ON)
|
| 460 |
-
(\\S)
|
| 461 |
-
(\\SC)
|
| 462 |
-
(\\SCC)
|
| 463 |
-
[Ag+]
|
| 464 |
-
[Ag-4]
|
| 465 |
-
[Ag]
|
| 466 |
-
[Al-3]
|
| 467 |
-
[Al]
|
| 468 |
-
[As+]
|
| 469 |
-
[AsH3]
|
| 470 |
-
[AsH]
|
| 471 |
-
[As]
|
| 472 |
-
[At]
|
| 473 |
-
[B-]
|
| 474 |
-
[B@-]
|
| 475 |
-
[B@@-]
|
| 476 |
-
[BH-]
|
| 477 |
-
[BH2-]
|
| 478 |
-
[BH3-]
|
| 479 |
-
[B]
|
| 480 |
-
[Ba]
|
| 481 |
-
[Br+2]
|
| 482 |
-
[BrH]
|
| 483 |
-
[Br]
|
| 484 |
-
[C+]
|
| 485 |
-
[C-]
|
| 486 |
-
[C@@H]
|
| 487 |
-
[C@@]
|
| 488 |
-
[C@H]
|
| 489 |
-
[C@]
|
| 490 |
-
[CH-]
|
| 491 |
-
[CH2]
|
| 492 |
-
[CH3]
|
| 493 |
-
[CH]
|
| 494 |
-
[C]
|
| 495 |
-
[CaH2]
|
| 496 |
-
[Ca]
|
| 497 |
-
[Cl+2]
|
| 498 |
-
[Cl+3]
|
| 499 |
-
[Cl+]
|
| 500 |
-
[Cs]
|
| 501 |
-
[FH]
|
| 502 |
-
[F]
|
| 503 |
-
[H]
|
| 504 |
-
[He]
|
| 505 |
-
[I+2]
|
| 506 |
-
[I+3]
|
| 507 |
-
[I+]
|
| 508 |
-
[IH]
|
| 509 |
-
[I]
|
| 510 |
-
[K]
|
| 511 |
-
[Kr]
|
| 512 |
-
[Li+]
|
| 513 |
-
[LiH]
|
| 514 |
-
[MgH2]
|
| 515 |
-
[Mg]
|
| 516 |
-
[N+]
|
| 517 |
-
[N-]
|
| 518 |
-
[N@+]
|
| 519 |
-
[N@@+]
|
| 520 |
-
[N@@]
|
| 521 |
-
[N@]
|
| 522 |
-
[NH+]
|
| 523 |
-
[NH-]
|
| 524 |
-
[NH2+]
|
| 525 |
-
[NH3]
|
| 526 |
-
[NH]
|
| 527 |
-
[N]
|
| 528 |
-
[Na]
|
| 529 |
-
[O+]
|
| 530 |
-
[O-]
|
| 531 |
-
[OH+]
|
| 532 |
-
[OH2]
|
| 533 |
-
[OH]
|
| 534 |
-
[O]
|
| 535 |
-
[P+]
|
| 536 |
-
[P@+]
|
| 537 |
-
[P@@+]
|
| 538 |
-
[P@@]
|
| 539 |
-
[P@]
|
| 540 |
-
[PH2]
|
| 541 |
-
[PH]
|
| 542 |
-
[P]
|
| 543 |
-
[Ra]
|
| 544 |
-
[Rb]
|
| 545 |
-
[S+]
|
| 546 |
-
[S-]
|
| 547 |
-
[S@+]
|
| 548 |
-
[S@@+]
|
| 549 |
-
[S@@]
|
| 550 |
-
[S@]
|
| 551 |
-
[SH+]
|
| 552 |
-
[SH2]
|
| 553 |
-
[SH]
|
| 554 |
-
[S]
|
| 555 |
-
[Se+]
|
| 556 |
-
[Se-2]
|
| 557 |
-
[SeH2]
|
| 558 |
-
[SeH]
|
| 559 |
-
[Se]
|
| 560 |
-
[Si@]
|
| 561 |
-
[SiH2]
|
| 562 |
-
[SiH]
|
| 563 |
-
[Si]
|
| 564 |
-
[SrH2]
|
| 565 |
-
[TeH]
|
| 566 |
-
[Te]
|
| 567 |
-
[Xe]
|
| 568 |
-
[Zn+2]
|
| 569 |
-
[Zn-2]
|
| 570 |
-
[Zn]
|
| 571 |
-
[b-]
|
| 572 |
-
[c+]
|
| 573 |
-
[c-]
|
| 574 |
-
[cH-]
|
| 575 |
-
[cH]
|
| 576 |
-
[c]
|
| 577 |
-
[n+]
|
| 578 |
-
[n-]
|
| 579 |
-
[nH]
|
| 580 |
-
[n]
|
| 581 |
-
[o+]
|
| 582 |
-
[s+]
|
| 583 |
-
[se+]
|
| 584 |
-
[se]
|
| 585 |
-
[te+]
|
| 586 |
-
[te]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
load.py → inference.py
RENAMED
|
@@ -48,16 +48,18 @@ def normalize_property_key(name: str) -> str:
|
|
| 48 |
n = name.strip().lower()
|
| 49 |
n = re.sub(r"\s*\(.*?\)\s*", "", n)
|
| 50 |
n = n.replace("-", "_").replace(" ", "_")
|
|
|
|
| 51 |
if "permeability" in n and "pampa" not in n and "caco" not in n:
|
| 52 |
return "permeability_penetrance"
|
| 53 |
if n == "binding_affinity":
|
| 54 |
return "binding_affinity"
|
| 55 |
-
if n
|
| 56 |
-
return "
|
| 57 |
if n == "non_fouling":
|
| 58 |
return "nf"
|
| 59 |
return n
|
| 60 |
|
|
|
|
| 61 |
def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
| 62 |
"""
|
| 63 |
Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
|
|
@@ -111,7 +113,8 @@ MODEL_ALIAS = {
|
|
| 111 |
"XGB": "xgb",
|
| 112 |
"XGB_REG": "xgb_reg",
|
| 113 |
"POOLED": "pooled",
|
| 114 |
-
"UNPOOLED": "unpooled"
|
|
|
|
| 115 |
}
|
| 116 |
def canon_model(label: Optional[str]) -> Optional[str]:
|
| 117 |
if label is None:
|
|
@@ -235,8 +238,25 @@ def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.devic
|
|
| 235 |
model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
|
| 236 |
layers=int(params["layers"]), dropout=dropout)
|
| 237 |
elif model_name == "transformer":
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
else:
|
| 241 |
raise ValueError(f"Unknown NN model_name={model_name}")
|
| 242 |
|
|
@@ -648,13 +668,21 @@ class PeptiVersePredictor:
|
|
| 648 |
self._load_all_best_models()
|
| 649 |
|
| 650 |
def _resolve_dir(self, prop_key: str, model_name: str, mode: str) -> Path:
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
"""
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
candidates = [
|
| 659 |
base / f"{model_name}_{mode}",
|
| 660 |
base / model_name,
|
|
@@ -667,7 +695,11 @@ class PeptiVersePredictor:
|
|
| 667 |
for d in candidates:
|
| 668 |
if d.exists():
|
| 669 |
return d
|
| 670 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
|
| 672 |
def _load_all_best_models(self):
|
| 673 |
for prop_key, row in self.manifest.items():
|
|
@@ -705,15 +737,24 @@ class PeptiVersePredictor:
|
|
| 705 |
self.models[(prop_key, mode)] = obj
|
| 706 |
else:
|
| 707 |
# rebuild NN architecture
|
| 708 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
|
| 710 |
self.meta[(prop_key, mode)] = {
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
|
|
|
| 717 |
|
| 718 |
def _get_features_for_model(self, prop_key: str, mode: str, input_str: str):
|
| 719 |
"""
|
|
@@ -769,6 +810,14 @@ class PeptiVersePredictor:
|
|
| 769 |
X, M = self._get_features_for_model(prop_key, mode, input_str)
|
| 770 |
with torch.no_grad():
|
| 771 |
y = model(X, M).squeeze().float().cpu().item()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
if task_type == "classifier":
|
| 773 |
prob = float(1.0 / (1.0 + np.exp(-y))) # sigmoid(logit)
|
| 774 |
out = {"property": prop_key, "mode": mode, "score": prob}
|
|
@@ -779,15 +828,22 @@ class PeptiVersePredictor:
|
|
| 779 |
else:
|
| 780 |
return {"property": prop_key, "mode": mode, "score": float(y)}
|
| 781 |
|
| 782 |
-
# xgb path
|
| 783 |
if kind == "xgb":
|
| 784 |
-
feats = self._get_features_for_model(prop_key, mode, input_str)
|
| 785 |
dmat = xgb.DMatrix(feats)
|
| 786 |
pred = float(model.predict(dmat)[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 787 |
out = {"property": prop_key, "mode": mode, "score": pred}
|
| 788 |
-
|
| 789 |
-
out["label"] = int(pred >= float(thr))
|
| 790 |
-
out["threshold"] = float(thr)
|
| 791 |
return out
|
| 792 |
|
| 793 |
# joblib path (svm/enet/svr)
|
|
@@ -850,7 +906,7 @@ class PeptiVersePredictor:
|
|
| 850 |
cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
|
| 851 |
cls_thr = affinity_to_class(affinity)
|
| 852 |
|
| 853 |
-
names = {0: "High (≥9)", 1: "Moderate (7
|
| 854 |
return {
|
| 855 |
"property": "binding_affinity",
|
| 856 |
"mode": mode,
|
|
@@ -861,14 +917,10 @@ class PeptiVersePredictor:
|
|
| 861 |
}
|
| 862 |
|
| 863 |
|
| 864 |
-
# -----------------------------
|
| 865 |
-
# Minimal usage
|
| 866 |
-
# -----------------------------
|
| 867 |
if __name__ == "__main__":
|
| 868 |
-
# Example:
|
| 869 |
predictor = PeptiVersePredictor(
|
| 870 |
manifest_path="best_models.txt",
|
| 871 |
-
classifier_weight_root="
|
| 872 |
)
|
| 873 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
| 874 |
print(predictor.predict_binding_affinity("wt", target_seq="...", binder_str="..."))
|
|
@@ -879,8 +931,8 @@ if __name__ == "__main__":
|
|
| 879 |
|
| 880 |
wt = WTEmbedder(device)
|
| 881 |
sm = SMILESEmbedder(device,
|
| 882 |
-
vocab_path="/
|
| 883 |
-
splits_path="
|
| 884 |
)
|
| 885 |
|
| 886 |
p = wt.pooled("GIGAVLKVLTTGLPALISWIKRKRQQ") # (1,1280)
|
|
|
|
| 48 |
n = name.strip().lower()
|
| 49 |
n = re.sub(r"\s*\(.*?\)\s*", "", n)
|
| 50 |
n = n.replace("-", "_").replace(" ", "_")
|
| 51 |
+
|
| 52 |
if "permeability" in n and "pampa" not in n and "caco" not in n:
|
| 53 |
return "permeability_penetrance"
|
| 54 |
if n == "binding_affinity":
|
| 55 |
return "binding_affinity"
|
| 56 |
+
if n in {"halflife", "half_life"}:
|
| 57 |
+
return "halflife"
|
| 58 |
if n == "non_fouling":
|
| 59 |
return "nf"
|
| 60 |
return n
|
| 61 |
|
| 62 |
+
|
| 63 |
def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
| 64 |
"""
|
| 65 |
Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
|
|
|
|
| 113 |
"XGB": "xgb",
|
| 114 |
"XGB_REG": "xgb_reg",
|
| 115 |
"POOLED": "pooled",
|
| 116 |
+
"UNPOOLED": "unpooled",
|
| 117 |
+
"TRANSFORMER_WT_LOG": "transformer_wt_log",
|
| 118 |
}
|
| 119 |
def canon_model(label: Optional[str]) -> Optional[str]:
|
| 120 |
if label is None:
|
|
|
|
| 238 |
model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
|
| 239 |
layers=int(params["layers"]), dropout=dropout)
|
| 240 |
elif model_name == "transformer":
|
| 241 |
+
d_model = (
|
| 242 |
+
params.get("d_model")
|
| 243 |
+
or params.get("hidden")
|
| 244 |
+
or params.get("hidden_dim")
|
| 245 |
+
)
|
| 246 |
+
if d_model is None:
|
| 247 |
+
raise KeyError(
|
| 248 |
+
f"Transformer checkpoint missing d_model/hidden. "
|
| 249 |
+
f"Available keys: {list(params.keys())}"
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
model = TransformerHead(
|
| 253 |
+
in_dim=in_dim,
|
| 254 |
+
d_model=int(d_model),
|
| 255 |
+
nhead=int(params["nhead"]),
|
| 256 |
+
layers=int(params["layers"]),
|
| 257 |
+
ff=int(params.get("ff", 4 * int(d_model))),
|
| 258 |
+
dropout=dropout
|
| 259 |
+
)
|
| 260 |
else:
|
| 261 |
raise ValueError(f"Unknown NN model_name={model_name}")
|
| 262 |
|
|
|
|
| 668 |
self._load_all_best_models()
|
| 669 |
|
| 670 |
def _resolve_dir(self, prop_key: str, model_name: str, mode: str) -> Path:
|
| 671 |
+
# map halflife -> half_life folder on disk (common layout)
|
| 672 |
+
disk_prop = "half_life" if prop_key == "halflife" else prop_key
|
| 673 |
+
base = self.training_root / disk_prop
|
| 674 |
+
|
| 675 |
+
# special handling for halflife xgb_wt_log / xgb_smiles
|
| 676 |
+
if prop_key == "halflife" and model_name in {"xgb_wt_log", "xgb_smiles"}:
|
| 677 |
+
d = base / model_name
|
| 678 |
+
if d.exists():
|
| 679 |
+
return d
|
| 680 |
+
|
| 681 |
+
if prop_key == "halflife" and model_name == "xgb":
|
| 682 |
+
d = base / ("xgb_wt_log" if mode == "wt" else "xgb_smiles")
|
| 683 |
+
if d.exists():
|
| 684 |
+
return d
|
| 685 |
+
|
| 686 |
candidates = [
|
| 687 |
base / f"{model_name}_{mode}",
|
| 688 |
base / model_name,
|
|
|
|
| 695 |
for d in candidates:
|
| 696 |
if d.exists():
|
| 697 |
return d
|
| 698 |
+
|
| 699 |
+
raise FileNotFoundError(
|
| 700 |
+
f"Cannot find model directory for {prop_key} {model_name} {mode}. Tried: {candidates}"
|
| 701 |
+
)
|
| 702 |
+
|
| 703 |
|
| 704 |
def _load_all_best_models(self):
|
| 705 |
for prop_key, row in self.manifest.items():
|
|
|
|
| 737 |
self.models[(prop_key, mode)] = obj
|
| 738 |
else:
|
| 739 |
# rebuild NN architecture
|
| 740 |
+
arch = m
|
| 741 |
+
if arch.startswith("transformer"):
|
| 742 |
+
arch = "transformer"
|
| 743 |
+
elif arch.startswith("mlp"):
|
| 744 |
+
arch = "mlp"
|
| 745 |
+
elif arch.startswith("cnn"):
|
| 746 |
+
arch = "cnn"
|
| 747 |
+
|
| 748 |
+
self.models[(prop_key, mode)] = build_torch_model_from_ckpt(arch, obj, self.device)
|
| 749 |
|
| 750 |
self.meta[(prop_key, mode)] = {
|
| 751 |
+
"task_type": row.task_type,
|
| 752 |
+
"threshold": thr,
|
| 753 |
+
"artifact": str(art),
|
| 754 |
+
"model_name": m,
|
| 755 |
+
"kind": kind,
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
|
| 759 |
def _get_features_for_model(self, prop_key: str, mode: str, input_str: str):
|
| 760 |
"""
|
|
|
|
| 810 |
X, M = self._get_features_for_model(prop_key, mode, input_str)
|
| 811 |
with torch.no_grad():
|
| 812 |
y = model(X, M).squeeze().float().cpu().item()
|
| 813 |
+
# invert log1p(hours) ONLY for WT half-life log models
|
| 814 |
+
model_name = meta.get("model_name", "")
|
| 815 |
+
if (
|
| 816 |
+
prop_key == "halflife"
|
| 817 |
+
and mode == "wt"
|
| 818 |
+
and model_name in {"xgb_wt_log", "transformer_wt_log"}
|
| 819 |
+
):
|
| 820 |
+
y = float(np.expm1(y))
|
| 821 |
if task_type == "classifier":
|
| 822 |
prob = float(1.0 / (1.0 + np.exp(-y))) # sigmoid(logit)
|
| 823 |
out = {"property": prop_key, "mode": mode, "score": prob}
|
|
|
|
| 828 |
else:
|
| 829 |
return {"property": prop_key, "mode": mode, "score": float(y)}
|
| 830 |
|
|
|
|
| 831 |
if kind == "xgb":
|
| 832 |
+
feats = self._get_features_for_model(prop_key, mode, input_str)
|
| 833 |
dmat = xgb.DMatrix(feats)
|
| 834 |
pred = float(model.predict(dmat)[0])
|
| 835 |
+
|
| 836 |
+
# invert log1p(hours) ONLY for WT half-life log models
|
| 837 |
+
model_name = meta.get("model_name", "")
|
| 838 |
+
if (
|
| 839 |
+
prop_key == "halflife"
|
| 840 |
+
and mode == "wt"
|
| 841 |
+
and model_name in {"xgb_wt_log", "transformer_wt_log"}
|
| 842 |
+
):
|
| 843 |
+
pred = float(np.expm1(pred))
|
| 844 |
+
|
| 845 |
out = {"property": prop_key, "mode": mode, "score": pred}
|
| 846 |
+
|
|
|
|
|
|
|
| 847 |
return out
|
| 848 |
|
| 849 |
# joblib path (svm/enet/svr)
|
|
|
|
| 906 |
cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
|
| 907 |
cls_thr = affinity_to_class(affinity)
|
| 908 |
|
| 909 |
+
names = {0: "High (≥9)", 1: "Moderate (7-9)", 2: "Low (<7)"}
|
| 910 |
return {
|
| 911 |
"property": "binding_affinity",
|
| 912 |
"mode": mode,
|
|
|
|
| 917 |
}
|
| 918 |
|
| 919 |
|
|
|
|
|
|
|
|
|
|
| 920 |
if __name__ == "__main__":
|
|
|
|
| 921 |
predictor = PeptiVersePredictor(
|
| 922 |
manifest_path="best_models.txt",
|
| 923 |
+
classifier_weight_root="./Classifier_Weight"
|
| 924 |
)
|
| 925 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
| 926 |
print(predictor.predict_binding_affinity("wt", target_seq="...", binder_str="..."))
|
|
|
|
| 931 |
|
| 932 |
wt = WTEmbedder(device)
|
| 933 |
sm = SMILESEmbedder(device,
|
| 934 |
+
vocab_path="./tokeizner/new_vocab.txt",
|
| 935 |
+
splits_path="./tokenizer/new_splits.txt"
|
| 936 |
)
|
| 937 |
|
| 938 |
p = wt.pooled("GIGAVLKVLTTGLPALISWIKRKRQQ") # (1,1280)
|
models/best_model_half_life.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f80f1b20e90ba30503804c738aad4b3bb253424ff2e6e8a86c8e13a2fa1669f9
|
| 3 |
-
size 2623795199
|
|
|
|
|
|
|
|
|
|
|
|
models/best_model_hemolysis.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:09b90730272d48f061bf41c79a5ae44f5c977f331d48600c8615852806308be1
|
| 3 |
-
size 1938117
|
|
|
|
|
|
|
|
|
|
|
|
models/best_model_nonfouling.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bc50e344298c5db6f45ba65b09c61f80ee47c8b4b33f7193b068618520c948d1
|
| 3 |
-
size 2275245
|
|
|
|
|
|
|
|
|
|
|
|
models/best_model_solubility.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:948ac245c158aacd51f36cc71f4ee7bbbf3568c92666c637689790a01677fa59
|
| 3 |
-
size 3698748
|
|
|
|
|
|
|
|
|
|
|
|
models/binding_affinity_smiles.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:04986ccd078abd8f744d299b5e73c1e93bf4899896bb8d8f0e2bacbe0e8c6c97
|
| 3 |
-
size 132487302
|
|
|
|
|
|
|
|
|
|
|
|
models/binding_affinity_unpooled.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fc28ae9f09b981b07547a773ca2e07f241cb08b3b8aa901e66627ff153f3aa8b
|
| 3 |
-
size 2731670995
|
|
|
|
|
|
|
|
|
|
|
|
models/enhancer_class.ckpt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0cdbd02bc600847aa238967f00bc66882e515d3385e9eb278c1fa85818625492
|
| 3 |
-
size 37598951
|
|
|
|
|
|
|
|
|
|
|
|
models/enhancer_class_hparams.yaml
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:579a2065f9976e9de5f4d90976973c049a33f135268961b76ebe6e38fd986450
|
| 3 |
-
size 1814
|
|
|
|
|
|
|
|
|
|
|
|
models/hemolysis-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d5a3c91f1bd746d6e7eda091147b555e760a5b2585423ce8d75990837e781b51
|
| 3 |
-
size 288201
|
|
|
|
|
|
|
|
|
|
|
|
models/nonfouling-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:269fe5d2a90e52a075949c9de281db5b65f97ea386369c687edb1656b8686381
|
| 3 |
-
size 165817
|
|
|
|
|
|
|
|
|
|
|
|
models/permeability-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4020f779e0d273fecc7a0dbe3dd43c40bbf028f76559c0f8d687a6da5a715267
|
| 3 |
-
size 6343274
|
|
|
|
|
|
|
|
|
|
|
|
models/solubility-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:49395cb4c55a08c9b9683c11133fbde3ef05cf81f4a28d3303c8cf8c06c55597
|
| 3 |
-
size 279803
|
|
|
|
|
|
|
|
|
|
|
|
scoring_functions.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import io
|
| 3 |
-
import subprocess
|
| 4 |
-
import warnings
|
| 5 |
-
import numpy as np
|
| 6 |
-
import pandas as pd
|
| 7 |
-
from typing import List
|
| 8 |
-
from loguru import logger
|
| 9 |
-
from tqdm import tqdm
|
| 10 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 11 |
-
from rdkit.Chem import AllChem
|
| 12 |
-
import torch
|
| 13 |
-
from functions.binding.binding import BindingAffinity
|
| 14 |
-
from functions.permeability.permeability import Permeability
|
| 15 |
-
from functions.solubility.solubility import Solubility
|
| 16 |
-
from functions.hemolysis.hemolysis import Hemolysis
|
| 17 |
-
from functions.nonfouling.nonfouling import Nonfouling
|
| 18 |
-
|
| 19 |
-
class ScoringFunctions:
|
| 20 |
-
def __init__(self, score_func_names=None, prot_seqs=[]):
|
| 21 |
-
"""
|
| 22 |
-
Class for generating score vectors given generated sequence
|
| 23 |
-
|
| 24 |
-
Args:
|
| 25 |
-
score_func_names: list of scoring function names to be evaluated
|
| 26 |
-
score_weights: weights to scale scores (default: 1)
|
| 27 |
-
target_protein: sequence of target protein binder
|
| 28 |
-
"""
|
| 29 |
-
if score_func_names is None:
|
| 30 |
-
# just do unmasking based on validity of peptide bonds
|
| 31 |
-
self.score_func_names = []
|
| 32 |
-
else:
|
| 33 |
-
self.score_func_names = score_func_names
|
| 34 |
-
|
| 35 |
-
# self.weights = np.array([1] * len(self.score_func_names) if score_weights is None else score_weights)
|
| 36 |
-
|
| 37 |
-
# binding affinities
|
| 38 |
-
self.target_protein = prot_seqs
|
| 39 |
-
print(len(prot_seqs))
|
| 40 |
-
|
| 41 |
-
if ('binding_affinity1' in score_func_names) and (len(prot_seqs) == 1):
|
| 42 |
-
binding_affinity1 = BindingAffinity(prot_seqs[0])
|
| 43 |
-
binding_affinity2 = None
|
| 44 |
-
elif ('binding_affinity1' in score_func_names) and ('binding_affinity2' in score_func_names) and (len(prot_seqs) == 2):
|
| 45 |
-
binding_affinity1 = BindingAffinity(prot_seqs[0])
|
| 46 |
-
binding_affinity2 = BindingAffinity(prot_seqs[1])
|
| 47 |
-
else:
|
| 48 |
-
print("here")
|
| 49 |
-
binding_affinity1 = None
|
| 50 |
-
binding_affinity2 = None
|
| 51 |
-
|
| 52 |
-
permeability = Permeability()
|
| 53 |
-
sol = Solubility()
|
| 54 |
-
nonfouling = Nonfouling()
|
| 55 |
-
hemo = Hemolysis()
|
| 56 |
-
|
| 57 |
-
self.all_funcs = {'binding_affinity1': binding_affinity1,
|
| 58 |
-
'binding_affinity2': binding_affinity2,
|
| 59 |
-
'permeability': permeability,
|
| 60 |
-
'nonfouling': nonfouling,
|
| 61 |
-
'solubility': sol,
|
| 62 |
-
'hemolysis': hemo
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
def forward(self, input_seqs):
|
| 66 |
-
scores = []
|
| 67 |
-
|
| 68 |
-
for i, score_func in enumerate(self.score_func_names):
|
| 69 |
-
score = self.all_funcs[score_func](input_seqs = input_seqs)
|
| 70 |
-
|
| 71 |
-
scores.append(score)
|
| 72 |
-
|
| 73 |
-
# convert to numpy arrays with shape (num_sequences, num_functions)
|
| 74 |
-
scores = np.float32(scores).T
|
| 75 |
-
|
| 76 |
-
return scores
|
| 77 |
-
|
| 78 |
-
def __call__(self, input_seqs: list):
|
| 79 |
-
return self.forward(input_seqs)
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def unittest():
|
| 83 |
-
amhr = 'MLGSLGLWALLPTAVEAPPNRRTCVFFEAPGVRGSTKTLGELLDTGTELPRAIRCLYSRCCFGIWNLTQDRAQVEMQGCRDSDEPGCESLHCDPSPRAHPSPGSTLFTCSCGTDFCNANYSHLPPPGSPGTPGSQGPQAAPGESIWMALVLLGLFLLLLLLLGSIILALLQRKNYRVRGEPVPEPRPDSGRDWSVELQELPELCFSQVIREGGHAVVWAGQLQGKLVAIKAFPPRSVAQFQAERALYELPGLQHDHIVRFITASRGGPGRLLSGPLLVLELHPKGSLCHYLTQYTSDWGSSLRMALSLAQGLAFLHEERWQNGQYKPGIAHRDLSSQNVLIREDGSCAIGDLGLALVLPGLTQPPAWTPTQPQGPAAIMEAGTQRYMAPELLDKTLDLQDWGMALRRADIYSLALLLWEILSRCPDLRPDSSPPPFQLAYEAELGNTPTSDELWALAVQERRRPYIPSTWRCFATDPDGLRELLEDCWDADPEARLTAECVQQRLAALAHPQESHPFPESCPRGCPPLCPEDCTSIPAPTILPCRPQRSACHFSVQQGPCSRNPQPACTLSPV'
|
| 84 |
-
tfr = 'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF'
|
| 85 |
-
gfap = 'MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPLPTRVDFSLAGALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEPTKLADVYQAELRELRLRLDQLTANSARLEVERDNLAQDLATVRQKLQDETNLRLEAENNLAAYRQEADEATLARLDLERKIESLEEEIRFLRKIHEEEVRELQEQLARQQVHVELDVAKPDLTAALKEIRTQYEAMASSNMHEAEEWYRSKFADLTDAAARNAELLRQAKHEANDYRRQLQSLTCDLESLRGTNESLERQMREQEERHVREAASYQEALARLEEEGQSLKDEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEENRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKESKQEHKDVM'
|
| 86 |
-
glp1 = 'MAGAPGPLRLALLLLGMVGRAGPRPQGATVSLWETVQKWREYRRQCQRSLTEDPPPATDLFCNRTFDEYACWPDGEPGSFVNVSCPWYLPWASSVPQGHVYRFCTAEGLWLQKDNSSLPWRDLSECEESKRGERSSPEEQLLFLYIIYTVGYALSFSALVIASAILLGFRHLHCTRNYIHLNLFASFILRALSVFIKDAALKWMYSTAAQQHQWDGLLSYQDSLSCRLVFLLMQYCVAANYYWLLVEGVYLYTLLAFSVLSEQWIFRLYVSIGWGVPLLFVVPWGIVKYLYEDEGCWTRNSNMNYWLIIRLPILFAIGVNFLIFVRVICIVVSKLKANLMCKTDIKCRLAKSTLTLIPLLGTHEVIFAFVMDEHARGTLRFIKLFTELSFTSFQGLMVAILYCFVNNEVQLEFRKSWERWRLEHLHIQRDSSMKPLKCPTSSLSSGATAGSSMYTATCQASCS'
|
| 87 |
-
glast = 'MTKSNGEEPKMGGRMERFQQGVRKRTLLAKKKVQNITKEDVKSYLFRNAFVLLTVTAVIVGTILGFTLRPYRMSYREVKYFSFPGELLMRMLQMLVLPLIISSLVTGMAALDSKASGKMGMRAVVYYMTTTIIAVVIGIIIVIIIHPGKGTKENMHREGKIVRVTAADAFLDLIRNMFPPNLVEACFKQFKTNYEKRSFKVPIQANETLVGAVINNVSEAMETLTRITEELVPVPGSVNGVNALGLVVFSMCFGFVIGNMKEQGQALREFFDSLNEAIMRLVAVIMWYAPVGILFLIAGKIVEMEDMGVIGGQLAMYTVTVIVGLLIHAVIVLPLLYFLVTRKNPWVFIGGLLQALITALGTSSSSATLPITFKCLEENNGVDKRVTRFVLPVGATINMDGTALYEALAAIFIAQVNNFELNFGQIITISITATAASIGAAGIPQAGLVTMVIVLTSVGLPTDDITLIIAVDWFLDRLRTTTNVLGDSLGAGIVEHLSRHELKNRDVEMGNSVIEENEMKKPYQLIAQDNETEKPIDSETKM'
|
| 88 |
-
ncam = 'LQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKASWTRPEKQETLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESLEFILVQADTPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEF'
|
| 89 |
-
cereblon = 'MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNIINFDTSLPTSHTYLGADMEEFHGRTLHDDDSCQVIPVLPQVMMILIPGQTLPLQLFHPQEVSMVRNLIQKDRTFAVLAYSNVQEREAQFGTTAEIYAYREEQDFGIEIVKVKAIGRQRFKVLELRTQSDGIQQAKVQILPECVLPSTMSAVQLESLNKCQIFPSKPVSREDQCSYKWWQKYQKRKFHCANLTSWPRWLYSLYDAETLMDRIKKQLREWDENLKDDSLPSNPIDFSYRVAACLPIDDVLRIQLLKIGSAIQRLRCELDIMNKCTSLCCKQCQETEITTKNEIFSLSLCGPMAAYVNPHGYVHETLTVYKACNLNLIGRPSTEHSWFPGYAWTVAQCKICASHIGWKFTATKKDMSPQKFWGLTRSALLPTIPDTEDEISPDKVILCL'
|
| 90 |
-
|
| 91 |
-
num_iter = 0
|
| 92 |
-
score_func_times = [0, 1, 2, 3, 4, 5]
|
| 93 |
-
|
| 94 |
-
scoring = ScoringFunctions(score_func_names=['binding_affinity1', 'solubility', 'hemolysis', 'nonfouling', 'permeability'], prot_seqs=[tfr])
|
| 95 |
-
|
| 96 |
-
smiles = ['N2[C@H](CC(C)C)C(=O)N1[C@@H](CCC1)C(=O)N1[C@@H](CCC1)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](Cc1ccccc1C(F)(F)F)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(=O)N)C2(=O)']
|
| 97 |
-
|
| 98 |
-
scores = scoring(input_seqs=smiles)
|
| 99 |
-
print(scores)
|
| 100 |
-
print(len(scores))
|
| 101 |
-
|
| 102 |
-
if __name__ == '__main__':
|
| 103 |
-
unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|