ynuozhang
commited on
Commit
·
a98d518
1
Parent(s):
cdf1251
remove unnecessary
Browse files- .gitattributes +8 -1
- README.md +449 -3
- functions/binding.py +0 -186
- functions/hemolysis.py +0 -69
- functions/nonfouling.py +0 -69
- functions/permeability.py +0 -167
- functions/solubility.py +0 -68
- functions/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc +0 -0
- functions/tokenizer/my_tokenizers.py +0 -398
- functions/tokenizer/new_splits.txt +0 -159
- functions/tokenizer/new_vocab.txt +0 -586
- load.py → inference.py +85 -33
- models/best_model_half_life.pth +0 -3
- models/best_model_hemolysis.json +0 -3
- models/best_model_nonfouling.json +0 -3
- models/best_model_solubility.json +0 -3
- models/binding_affinity_smiles.pt +0 -3
- models/binding_affinity_unpooled.pt +0 -3
- models/enhancer_class.ckpt +0 -3
- models/enhancer_class_hparams.yaml +0 -3
- models/hemolysis-xgboost_smiles.json +0 -3
- models/nonfouling-xgboost_smiles.json +0 -3
- models/permeability-xgboost_smiles.json +0 -3
- models/solubility-xgboost_smiles.json +0 -3
- scoring_functions.py +0 -103
.gitattributes
CHANGED
|
@@ -85,7 +85,14 @@ metrics/solubility/train_classification_plot.png filter=lfs diff=lfs merge=lfs -
|
|
| 85 |
metrics filter=lfs diff=lfs merge=lfs -text
|
| 86 |
models filter=lfs diff=lfs merge=lfs -text
|
| 87 |
training_data filter=lfs diff=lfs merge=lfs -text
|
| 88 |
-
README.md filter=lfs diff=lfs merge=lfs -text
|
| 89 |
embeddings filter=lfs diff=lfs merge=lfs -text
|
| 90 |
models/binding_affinity_for_smiles.pt filter=lfs diff=lfs merge=lfs -text
|
| 91 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 85 |
metrics filter=lfs diff=lfs merge=lfs -text
|
| 86 |
models filter=lfs diff=lfs merge=lfs -text
|
| 87 |
training_data filter=lfs diff=lfs merge=lfs -text
|
|
|
|
| 88 |
embeddings filter=lfs diff=lfs merge=lfs -text
|
| 89 |
models/binding_affinity_for_smiles.pt filter=lfs diff=lfs merge=lfs -text
|
| 90 |
*.csv filter=lfs diff=lfs merge=lfs -text
|
| 91 |
+
training_classifiers/half_life/xgb_wt_log/best_model.json filter=lfs diff=lfs merge=lfs -text
|
| 92 |
+
training_classifiers/half_life/xgb_wt_raw/best_model.json filter=lfs diff=lfs merge=lfs -text
|
| 93 |
+
training_data_cleaned/toxicity/*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 94 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings/*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 95 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings/train/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
|
| 96 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings_unpooled/train/data-00000-of-00005.arrow filter=lfs diff=lfs merge=lfs -text
|
| 97 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings_unpooled/val/*.arrow filter=lfs diff=lfs merge=lfs -text
|
| 98 |
+
training_data_cleaned/toxicity/tox_smiles_with_embeddings/val/data-00000-of-00001.arrow filter=lfs diff=lfs merge=lfs -text
|
README.md
CHANGED
|
@@ -1,3 +1,449 @@
|
|
| 1 |
-
|
| 2 |
-
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: cc-by-nc-nd-4.0
|
| 3 |
+
---
|
| 4 |
+
|
| 5 |
+

|
| 6 |
+
|
| 7 |
+
# PeptiVerse 🧬🌌
|
| 8 |
+
|
| 9 |
+
A collection of machine learning predictors for canonical and non-canonical peptide property prediction using sequence and SMILES representations. 🧬 PeptiVerse 🌌 enables evaluation of key biophysical and therapeutic properties of peptides for property-optimized generation.
|
| 10 |
+
|
| 11 |
+
## Table of Contents
|
| 12 |
+
|
| 13 |
+
- [Quick start](#quick-start-)
|
| 14 |
+
- [Installation](#installation-)
|
| 15 |
+
- [Repository Structure](#repository-structure-)
|
| 16 |
+
- [Training data collection](#training-data-collection-)
|
| 17 |
+
- [Best model list](#best-model-list-)
|
| 18 |
+
- [Full model set (cuML-enabled)](#full-model-set-cuml-enabled)
|
| 19 |
+
- [Minimal deployable model set (no cuML)](#minimal-deployable-model-set-no-cuml)
|
| 20 |
+
- [Usage](#usage-)
|
| 21 |
+
- [Local Application Hosting](#local-application-hosting)
|
| 22 |
+
- [Dataset integration](#dataset-integration)
|
| 23 |
+
- [Quick inference by property per model](#quick-inference-by-property-per-model)
|
| 24 |
+
- [Property Interpretations](#property-interpretations-)
|
| 25 |
+
- [Model Architecture](#model-architecture-)
|
| 26 |
+
- [Troubleshooting](#troubleshooting-)
|
| 27 |
+
- [Citation](#citation-)
|
| 28 |
+
|
| 29 |
+
## Quick start 🌟
|
| 30 |
+
|
| 31 |
+
```bash
|
| 32 |
+
# Clone repository
|
| 33 |
+
git clone https://huggingface.co/ChatterjeeLab/PeptiVerse
|
| 34 |
+
|
| 35 |
+
# Install dependencies
|
| 36 |
+
pip install -r requirements.txt
|
| 37 |
+
|
| 38 |
+
# Run inference
|
| 39 |
+
python inference.py
|
| 40 |
+
```
|
| 41 |
+
## Installation 🌟
|
| 42 |
+
### Minimal Setup 🚀
|
| 43 |
+
- Easy start-up environment (using transformers, xgboost models)
|
| 44 |
+
```bash
|
| 45 |
+
pip install -r requirements.txt
|
| 46 |
+
```
|
| 47 |
+
### Full Setup 🚀
|
| 48 |
+
- Additional access to trained SVM and ElasticNet models requires installation of `RAPIDS cuML`, with instructions available from their official [github page](https://github.com/rapidsai/cuml) (**CUDA-capable GPU required**).
|
| 49 |
+
- Optional: pre-compiled Singularity/Apptainer environment (7.52G) is available at [Google drive](https://drive.google.com/file/d/1RJQ9HK0_gsPOhRo5H5ZmH_MYcpJqQD7e/view?usp=sharing) with everything you need (still need CUDA/GPU to load cuML models).
|
| 50 |
+
```
|
| 51 |
+
# test
|
| 52 |
+
apptainer exec peptiverse.sif python -c "import sys; print(sys.executable)"
|
| 53 |
+
|
| 54 |
+
# run inference (see below)
|
| 55 |
+
apptainer exec peptiverse.sif python inference.py
|
| 56 |
+
```
|
| 57 |
+
## Repository structure 🌟
|
| 58 |
+
This repo contains important large files for [PeptiVerse](https://huggingface.co/spaces/ChatterjeeLab/PeptiVerse), an interactive app for peptide property prediction. [Paper link.](https://www.biorxiv.org/content/10.64898/2025.12.31.697180v1)
|
| 59 |
+
|
| 60 |
+
```
|
| 61 |
+
PeptiVerse/
|
| 62 |
+
├── training_data_cleaned/ # Processed datasets with embeddings
|
| 63 |
+
│ └── <property>/ # Property-specific data
|
| 64 |
+
│ ├── train/val splits
|
| 65 |
+
│ └── precomputed embeddings
|
| 66 |
+
├── training_classifiers/ # Trained model weights
|
| 67 |
+
│ └── <property>/
|
| 68 |
+
│ ├── cnn_wt/ # CNN architectures
|
| 69 |
+
│ ├── mlp_wt/ # MLP architectures
|
| 70 |
+
│ └── xgb_wt/ # XGBoost models
|
| 71 |
+
├── tokenizer/ # PeptideCLM tokenizer
|
| 72 |
+
├── training_data/ # Raw training data
|
| 73 |
+
├── inference.py # Main prediction interface
|
| 74 |
+
├── best_models.txt # Model selection manifest
|
| 75 |
+
└── requirements.txt # Python dependencies
|
| 76 |
+
```
|
| 77 |
+
|
| 78 |
+
## Training Data Collection 🌟
|
| 79 |
+
|
| 80 |
+
<table>
|
| 81 |
+
<caption><strong>Data distribution.</strong> Classification tasks report counts for class 0/1; regression tasks report total sample size (N).</caption>
|
| 82 |
+
<thead>
|
| 83 |
+
<tr>
|
| 84 |
+
<th rowspan="2"><strong>Properties</strong></th>
|
| 85 |
+
<th colspan="2"><strong>Amino Acid Sequences</strong></th>
|
| 86 |
+
<th colspan="2"><strong>SMILES Sequences</strong></th>
|
| 87 |
+
</tr>
|
| 88 |
+
<tr>
|
| 89 |
+
<th><strong>0</strong></th>
|
| 90 |
+
<th><strong>1</strong></th>
|
| 91 |
+
<th><strong>0</strong></th>
|
| 92 |
+
<th><strong>1</strong></th>
|
| 93 |
+
</tr>
|
| 94 |
+
</thead>
|
| 95 |
+
<tbody>
|
| 96 |
+
<tr>
|
| 97 |
+
<td colspan="5"><strong>Classification</strong></td>
|
| 98 |
+
</tr>
|
| 99 |
+
<tr>
|
| 100 |
+
<td>Hemolysis</td>
|
| 101 |
+
<td>4765</td>
|
| 102 |
+
<td>1311</td>
|
| 103 |
+
<td>4765</td>
|
| 104 |
+
<td>1311</td>
|
| 105 |
+
</tr>
|
| 106 |
+
<tr>
|
| 107 |
+
<td>Non-Fouling</td>
|
| 108 |
+
<td>13580</td>
|
| 109 |
+
<td>3600</td>
|
| 110 |
+
<td>13580</td>
|
| 111 |
+
<td>3600</td>
|
| 112 |
+
</tr>
|
| 113 |
+
<tr>
|
| 114 |
+
<td>Solubility</td>
|
| 115 |
+
<td>9668</td>
|
| 116 |
+
<td>8785</td>
|
| 117 |
+
<td>-</td>
|
| 118 |
+
<td>-</td>
|
| 119 |
+
</tr>
|
| 120 |
+
<tr>
|
| 121 |
+
<td>Permeability (Penetrance)</td>
|
| 122 |
+
<td>1162</td>
|
| 123 |
+
<td>1162</td>
|
| 124 |
+
<td>-</td>
|
| 125 |
+
<td>-</td>
|
| 126 |
+
</tr>
|
| 127 |
+
<tr>
|
| 128 |
+
<td>Toxicity</td>
|
| 129 |
+
<td>-</td>
|
| 130 |
+
<td>-</td>
|
| 131 |
+
<td>5518</td>
|
| 132 |
+
<td>5518</td>
|
| 133 |
+
</tr>
|
| 134 |
+
<tr>
|
| 135 |
+
<td colspan="5"><strong>Regression (N)</strong></td>
|
| 136 |
+
</tr>
|
| 137 |
+
<tr>
|
| 138 |
+
<td>Permeability (PAMPA)</td>
|
| 139 |
+
<td colspan="2" align="center">-</td>
|
| 140 |
+
<td colspan="2" align="center">6869</td>
|
| 141 |
+
</tr>
|
| 142 |
+
<tr>
|
| 143 |
+
<td>Permeability (CACO2)</td>
|
| 144 |
+
<td colspan="2" align="center">-</td>
|
| 145 |
+
<td colspan="2" align="center">606</td>
|
| 146 |
+
</tr>
|
| 147 |
+
<tr>
|
| 148 |
+
<td>Half-Life</td>
|
| 149 |
+
<td colspan="2" align="center">130</td>
|
| 150 |
+
<td colspan="2" align="center">245</td>
|
| 151 |
+
</tr>
|
| 152 |
+
<tr>
|
| 153 |
+
<td>Binding Affinity</td>
|
| 154 |
+
<td colspan="2" align="center">1436</td>
|
| 155 |
+
<td colspan="2" align="center">1597</td>
|
| 156 |
+
</tr>
|
| 157 |
+
</tbody>
|
| 158 |
+
</table>
|
| 159 |
+
|
| 160 |
+
|
| 161 |
+
## Best Model List 🌟
|
| 162 |
+
|
| 163 |
+
### Full model set (cuML-enabled)
|
| 164 |
+
| Property | Best Model (Sequence) | Best Model (SMILES) | Task Type | Threshold (Sequence) | Threshold (SMILES) |
|
| 165 |
+
|----------------------------|-----------------|---------------------|-------------|----------------|--------------------|
|
| 166 |
+
| Hemolysis | SVM | Transformer | Classifier | 0.2521 | 0.4343 |
|
| 167 |
+
| Non-Fouling | MLP | ENET | Classifier | 0.57 | 0.6969 |
|
| 168 |
+
| Solubility | CNN | – | Classifier | 0.377 | – |
|
| 169 |
+
| Permeability (Penetrance) | SVM | – | Classifier | 0.5493 | – |
|
| 170 |
+
| Toxicity | – | Transformer | Classifier | – | 0.3401 |
|
| 171 |
+
| Binding Affinity | unpooled | unpooled | Regression | – | – |
|
| 172 |
+
| Permeability (PAMPA) | – | CNN | Regression | – | – |
|
| 173 |
+
| Permeability (Caco-2) | – | SVR | Regression | – | – |
|
| 174 |
+
| Half-life | Transformer | XGB | Regression | – | – |
|
| 175 |
+
>Note: *unpooled* indicates models operating on token-level embeddings with cross-attention, rather than mean-pooled representations.
|
| 176 |
+
|
| 177 |
+
### Minimal deployable model set (no cuML)
|
| 178 |
+
| Property | Best Model (WT) | Best Model (SMILES) | Task Type | Threshold (WT) | Threshold (SMILES) |
|
| 179 |
+
|----------------------------|-----------------|---------------------|-------------|----------------|--------------------|
|
| 180 |
+
| Hemolysis | XGB | Transformer | Classifier | 0.2801 | 0.4343 |
|
| 181 |
+
| Non-Fouling | MLP | XGB | Classifier | 0.57 | 0.3982 |
|
| 182 |
+
| Solubility | CNN | – | Classifier | 0.377 | – |
|
| 183 |
+
| Permeability (Penetrance) | XGB | – | Classifier | 0.4301 | – |
|
| 184 |
+
| Toxicity | – | Transformer | Classifier | – | 0.3401 |
|
| 185 |
+
| Binding Affinity | unpooled | unpooled | Regression | – | – |
|
| 186 |
+
| Permeability (PAMPA) | – | CNN | Regression | – | – |
|
| 187 |
+
| Permeability (Caco-2) | – | SVR | Regression | – | – |
|
| 188 |
+
| Half-life | xgb_wt_log | xgb_smiles | Regression | – | – |
|
| 189 |
+
|
| 190 |
+
>Note: Models marked as SVM or ENET are replaced with XGB, as these models are not currently supported in the deployment environment without a cuML setup. *xgb_wt_log* indicates that a log-scaled transformation of time was applied during training.
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
## Usage 🌟
|
| 194 |
+
|
| 195 |
+
### Local Application Hosting
|
| 196 |
+
- Host the [PeptiVerse UI](https://huggingface.co/spaces/ChatterjeeLab/PeptiVerse) locally with your own resources.
|
| 197 |
+
```bash
|
| 198 |
+
# Configure models in best_models.txt
|
| 199 |
+
|
| 200 |
+
git clone https://huggingface.co/spaces/ChatterjeeLab/PeptiVerse
|
| 201 |
+
python app.py
|
| 202 |
+
```
|
| 203 |
+
### Dataset integration
|
| 204 |
+
- All properties are provided with raw_data/split_ready_csvs/[huggingface_datasets](https://huggingface.co/docs/datasets/en/index).
|
| 205 |
+
- Selectively download only the data you need with `huggingface-cli`
|
| 206 |
+
```bash
|
| 207 |
+
huggingface-cli download ChatterjeeLab/PeptiVerse \
|
| 208 |
+
--include "training_data_cleaned/**" \ # only this folder
|
| 209 |
+
--exclude "**/*.pt" "**/*.joblib" \ # skip weights/artifacts
|
| 210 |
+
--local-dir PeptiVerse_partial \
|
| 211 |
+
--local-dir-use-symlinks False # make real copies
|
| 212 |
+
```
|
| 213 |
+
- Or in python
|
| 214 |
+
```python
|
| 215 |
+
from huggingface_hub import snapshot_download
|
| 216 |
+
|
| 217 |
+
local_dir = snapshot_download(
|
| 218 |
+
repo_id="ChatterjeeLab/PeptiVerse",
|
| 219 |
+
allow_patterns=["training_data_cleaned/**"], # only this folder
|
| 220 |
+
ignore_patterns=["**/*.pt", "**/*.joblib"], # skip weights/artifacts
|
| 221 |
+
local_dir="PeptiVerse_partial",
|
| 222 |
+
local_dir_use_symlinks=False, # make real copies
|
| 223 |
+
)
|
| 224 |
+
print("Downloaded to:", local_dir)
|
| 225 |
+
```
|
| 226 |
+
- Usage of the huggingface datasets (with pre-computed embeddings and splits)
|
| 227 |
+
- All embedding datasets are saved via `DatasetDict.save_to_disk` and loadable with:
|
| 228 |
+
``` python
|
| 229 |
+
from datasets import load_from_disk
|
| 230 |
+
ds = load_from_disk(PATH)
|
| 231 |
+
train_ds = ds["train"]
|
| 232 |
+
val_ds = ds["val"]
|
| 233 |
+
```
|
| 234 |
+
- A) Sequence Based ([ESM-2](https://huggingface.co/facebook/esm2_t33_650M_UR50D) embeddings)
|
| 235 |
+
- Pooled (fixed-length vector per sequence)
|
| 236 |
+
- Generated by mean-pooling token embeddings excluding special tokens (CLS/EOS) and padding.
|
| 237 |
+
- Each item:
|
| 238 |
+
sequence: `str`
|
| 239 |
+
label: `int` (classification) or `float` (regression)
|
| 240 |
+
embedding: `float32[H]` (H=1280 for ESM-2 650M)
|
| 241 |
+
- Unpooled (variable-length token matrix)
|
| 242 |
+
- Generated by keeping all valid token embeddings (excluding special tokens + padding) as a per-sequence matrix.
|
| 243 |
+
- Each item:
|
| 244 |
+
sequence: `str`
|
| 245 |
+
label: `int` (classification) or `float` (regression)
|
| 246 |
+
embedding: `float16[L, H]` (nested lists)
|
| 247 |
+
attention_mask: `int8[L]`
|
| 248 |
+
length: `int` (=L)
|
| 249 |
+
- B) SMILES-based ([PeptideCLM](https://github.com/AaronFeller/PeptideCLM) embeddings)
|
| 250 |
+
- Pooled (fixed-length vector per sequence)
|
| 251 |
+
- Generated by mean-pooling token embeddings excluding special tokens (CLS/EOS) and padding.
|
| 252 |
+
- Each item:
|
| 253 |
+
sequence: `str` (SMILES)
|
| 254 |
+
label: `int` (classification) or `float` (regression)
|
| 255 |
+
embedding: `float32[H]`
|
| 256 |
+
- Unpooled (variable-length token matrix)
|
| 257 |
+
- Generated by keeping all valid token embeddings (excluding special tokens + padding) as a per-sequence matrix.
|
| 258 |
+
- Each item:
|
| 259 |
+
sequence: `str` (SMILES)
|
| 260 |
+
label: `int` (classification) or `float` (regression)
|
| 261 |
+
embedding: `float16[L, H]` (nested lists)
|
| 262 |
+
attention_mask: `int8[L]`
|
| 263 |
+
length: `int` (=L)
|
| 264 |
+
|
| 265 |
+
|
| 266 |
+
### Quick inference by property per model
|
| 267 |
+
```python
|
| 268 |
+
from inference import PeptiVersePredictor
|
| 269 |
+
|
| 270 |
+
pred = PeptiVersePredictor(
|
| 271 |
+
manifest_path="best_models.txt", # best model list
|
| 272 |
+
classifier_weight_root=".", # repo root (where training_classifiers/ lives)
|
| 273 |
+
device="cuda", # or "cpu"
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
# mode: smiles (SMILES-based models) / wt (Sequence-based models)
|
| 277 |
+
# property keys (with some level of name normalization)
|
| 278 |
+
# hemolysis
|
| 279 |
+
# nf (Non-Fouling)
|
| 280 |
+
# solubility
|
| 281 |
+
# permeability_penetrance
|
| 282 |
+
# toxicity
|
| 283 |
+
# permeability_pampa
|
| 284 |
+
# permeability_caco2
|
| 285 |
+
# halflife
|
| 286 |
+
# binding_affinity
|
| 287 |
+
|
| 288 |
+
seq = "GIVEQCCTSICSLYQLENYCN"
|
| 289 |
+
smiles = "CC(C)C[C@@H]1NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@@H](C)N(C)C(=O)[C@H](Cc2ccccc2)NC(=O)[C@H](CC(C)C)N(C)C(=O)[C@H]2CCCN2C1=O"
|
| 290 |
+
|
| 291 |
+
# Hemolysis
|
| 292 |
+
out = pred.predict_property("hemolysis", mode="wt", input_str=seq)
|
| 293 |
+
print(out)
|
| 294 |
+
# {"property":"hemolysis","mode":"wt","score":prob,"label":0/1,"threshold":...}
|
| 295 |
+
|
| 296 |
+
out = pred.predict_property("hemolysis", mode="smiles", input_str=smiles)
|
| 297 |
+
print(out)
|
| 298 |
+
|
| 299 |
+
# Non-fouling (key is nf)
|
| 300 |
+
out = pred.predict_property("nf", mode="wt", input_str=seq)
|
| 301 |
+
print(out)
|
| 302 |
+
|
| 303 |
+
out = pred.predict_property("nf", mode="smiles", input_str=smiles)
|
| 304 |
+
print(out)
|
| 305 |
+
|
| 306 |
+
# Solubility (Sequence-only)
|
| 307 |
+
out = pred.predict_property("solubility", mode="wt", input_str=seq)
|
| 308 |
+
print(out)
|
| 309 |
+
|
| 310 |
+
# Permeability (Penetrance) (Sequence-only)
|
| 311 |
+
out = pred.predict_property("permeability_penetrance", mode="wt", input_str=seq)
|
| 312 |
+
print(out)
|
| 313 |
+
|
| 314 |
+
# Toxicity (SMILES-only)
|
| 315 |
+
out = pred.predict_property("toxicity", mode="smiles", input_str=smiles)
|
| 316 |
+
print(out)
|
| 317 |
+
|
| 318 |
+
# Permeability (PAMPA) (SMILES regression)
|
| 319 |
+
out = pred.predict_property("permeability_pampa", mode="smiles", input_str=smiles)
|
| 320 |
+
print(out)
|
| 321 |
+
# {"property":"permeability_pampa","mode":"smiles","score":value}
|
| 322 |
+
|
| 323 |
+
# Permeability (Caco-2) (SMILES regression)
|
| 324 |
+
out = pred.predict_property("permeability_caco2", mode="smiles", input_str=smiles)
|
| 325 |
+
print(out)
|
| 326 |
+
|
| 327 |
+
# Half-life (sequence-based + SMILES regression)
|
| 328 |
+
out = pred.predict_property("halflife", mode="wt", input_str=seq)
|
| 329 |
+
print(out)
|
| 330 |
+
|
| 331 |
+
out = pred.predict_property("halflife", mode="smiles", input_str=smiles)
|
| 332 |
+
print(out)
|
| 333 |
+
|
| 334 |
+
# Binding Affinity
|
| 335 |
+
protein = "MTEYKLVVVGAGGVGKSALTIQLIQNHFVDEYDPTIEDSYRKQV..." # target protein
|
| 336 |
+
peptide_seq = "GIVEQCCTSICSLYQLENYCN"
|
| 337 |
+
|
| 338 |
+
out = pred.predict_binding_affinity(
|
| 339 |
+
mode="wt",
|
| 340 |
+
target_seq=protein,
|
| 341 |
+
binder_str=peptide_seq,
|
| 342 |
+
)
|
| 343 |
+
print(out)
|
| 344 |
+
# {
|
| 345 |
+
# "property":"binding_affinity",
|
| 346 |
+
# "mode":"wt",
|
| 347 |
+
# "affinity": float,
|
| 348 |
+
# "class_by_threshold": "High (≥9)" / "Moderate (7-9)" / "Low (<7)",
|
| 349 |
+
# "class_by_logits": same buckets,
|
| 350 |
+
# "binding_model": "pooled" or "unpooled",
|
| 351 |
+
# }
|
| 352 |
+
|
| 353 |
+
```
|
| 354 |
+
|
| 355 |
+
## Interpretation 🌟
|
| 356 |
+
|
| 357 |
+
You can also find the same description in the paper or in the PeptiVerse app `Documentation` tab.
|
| 358 |
+
|
| 359 |
+
---
|
| 360 |
+
#### 🩸 Hemolysis Prediction
|
| 361 |
+
Hemolysis is quantified by the concentration at which 50% of red blood cells are lysed (HC50). If HC50 < 100 µM, a peptide is considered hemolytic, otherwise non-hemolytic, resulting in a binary 0/1 dataset. The predicted probability should therefore be interpreted as a risk indicator, not an exact concentration estimate.
|
| 362 |
+
**Output interpretation:**
|
| 363 |
+
- Score close to 1.0 = high probability of red blood cell membrane disruption
|
| 364 |
+
- Score close to 0.0 = non-hemolytic
|
| 365 |
+
---
|
| 366 |
+
|
| 367 |
+
#### 💧 Solubility Prediction
|
| 368 |
+
Outputs a probability (0–1) that a peptide remains soluble in aqueous conditions.
|
| 369 |
+
**Output interpretation:**
|
| 370 |
+
- Score close to 1.0 = highly soluble
|
| 371 |
+
- Score close to 0.0 = poorly soluble
|
| 372 |
+
|
| 373 |
+
---
|
| 374 |
+
|
| 375 |
+
#### 👯 Non-Fouling Prediction
|
| 376 |
+
Higher scores indicate stronger non-fouling behavior, desirable for circulation and surface-exposed applications.
|
| 377 |
+
**Output interpretation:**
|
| 378 |
+
- Score close to 1.0 = non-fouling
|
| 379 |
+
- Score close to 0.0 = fouling
|
| 380 |
+
---
|
| 381 |
+
|
| 382 |
+
#### 🪣 Permeability Prediction
|
| 383 |
+
Predicts membrane permeability on a log P scale.
|
| 384 |
+
**Output interpretation:**
|
| 385 |
+
- Higher values = more permeable (>-6.0)
|
| 386 |
+
- For penetrance predictions, it is a classification prediction, so within the [0, 1] range, closer to 1 indicates more permeable.
|
| 387 |
+
---
|
| 388 |
+
|
| 389 |
+
#### ⏱️ Half-Life Prediction
|
| 390 |
+
**Interpretation:** Predicted values reflect relative peptide stability for the unit in hours. Higher scores indicate longer persistence in serum, while lower scores suggest faster degradation.
|
| 391 |
+
|
| 392 |
+
---
|
| 393 |
+
|
| 394 |
+
#### ☠️ Toxicity Prediction
|
| 395 |
+
**Interpretation:** Outputs a probability (0–1) that a peptide exhibits toxic effects. Higher scores indicate increased toxicity risk.
|
| 396 |
+
|
| 397 |
+
---
|
| 398 |
+
|
| 399 |
+
#### 🔗 Binding Affinity Prediction
|
| 400 |
+
|
| 401 |
+
Predicts peptide-protein binding affinity. Requires both peptide and target protein sequence.
|
| 402 |
+
|
| 403 |
+
**Interpretation:**<br>
|
| 404 |
+
- Scores ≥ 9 correspond to tight binders (K ≤ 10⁻⁹ M, nanomolar to picomolar range)<br>
|
| 405 |
+
- Scores between 7 and 9 correspond to medium binders (10⁻⁷–10⁻⁹ M, nanomolar to micromolar range)<br>
|
| 406 |
+
- Scores < 7 correspond to weak binders (K ≥ 10⁻⁶ M, micromolar and weaker)<br>
|
| 407 |
+
- A difference of 1 unit in score corresponds to an approximately tenfold change in binding affinity.<br>
|
| 408 |
+
|
| 409 |
+
|
| 410 |
+
## Model Architecture 🌟
|
| 411 |
+
|
| 412 |
+
- **Sequence Embeddings:** [ESM-2 650M model](https://huggingface.co/facebook/esm2_t33_650M_UR50D) / [PeptideCLM model](https://huggingface.co/aaronfeller/PeptideCLM-23M-all). Foundational embeddings are frozen.
|
| 413 |
+
- **XGBoost Model:** Gradient boosting on pooled embedding features for efficient, high-performance prediction.
|
| 414 |
+
- **CNN/Transformer Model:** One-dimensional convolutional/self-attention transformer networks operating on unpooled embeddings to capture local sequence patterns.
|
| 415 |
+
- **Binding Model:** Transformer-based architecture with cross-attention between protein and peptide representations.
|
| 416 |
+
- **SVR Model:** Support Vector Regression applied to pooled embeddings, providing a kernel-based, nonparametric regression baseline that is robust on smaller or noisy datasets.
|
| 417 |
+
- **Others:** SVM and Elastic Nets were trained with [RAPIDS cuML](https://github.com/rapidsai/cuml), which requires a CUDA environment and is therefore not supported in the web app. Model checkpoints remain available in the Hugging Face repository.
|
| 418 |
+
|
| 419 |
+
## Troubleshooting 🌟
|
| 420 |
+
|
| 421 |
+
### LFS Download Issues
|
| 422 |
+
|
| 423 |
+
If files appear as SHA pointers:
|
| 424 |
+
|
| 425 |
+
```bash
|
| 426 |
+
huggingface-cli download ChatterjeeLab/PeptiVerse \
|
| 427 |
+
training_data_cleaned/hemolysis/hemo_smiles_meta_with_split.csv \
|
| 428 |
+
--local-dir . \
|
| 429 |
+
--local-dir-use-symlinks False
|
| 430 |
+
```
|
| 431 |
+
|
| 432 |
+
## Citation 🌟
|
| 433 |
+
|
| 434 |
+
If you find this repository helpful for your publications, please consider citing our paper:
|
| 435 |
+
|
| 436 |
+
```
|
| 437 |
+
@article {Zhang2025.12.31.697180,
|
| 438 |
+
author = {Zhang, Yinuo and Tang, Sophia and Chen, Tong and Mahood, Elizabeth and Vincoff, Sophia and Chatterjee, Pranam},
|
| 439 |
+
title = {PeptiVerse: A Unified Platform for Therapeutic Peptide Property Prediction},
|
| 440 |
+
elocation-id = {2025.12.31.697180},
|
| 441 |
+
year = {2026},
|
| 442 |
+
doi = {10.64898/2025.12.31.697180},
|
| 443 |
+
publisher = {Cold Spring Harbor Laboratory},
|
| 444 |
+
URL = {https://www.biorxiv.org/content/early/2026/01/03/2025.12.31.697180},
|
| 445 |
+
eprint = {https://www.biorxiv.org/content/early/2026/01/03/2025.12.31.697180.full.pdf},
|
| 446 |
+
journal = {bioRxiv}
|
| 447 |
+
}
|
| 448 |
+
```
|
| 449 |
+
To use this repository, you agree to abide by the CC BY-NC-ND 4.0 License.
|
functions/binding.py
DELETED
|
@@ -1,186 +0,0 @@
|
|
| 1 |
-
|
| 2 |
-
import torch
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import torch.nn as nn
|
| 5 |
-
import esm
|
| 6 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 7 |
-
from transformers import AutoModelForMaskedLM, AutoModelForCausalLM, AutoTokenizer, AutoModel
|
| 8 |
-
|
| 9 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 10 |
-
|
| 11 |
-
class ImprovedBindingPredictor(nn.Module):
    """Cross-attention model predicting peptide–protein binding affinity.

    Consumes a protein embedding (e.g. ESM-2, width ``esm_dim``) and a
    peptide-SMILES embedding (e.g. PeptideCLM, width ``smiles_dim``),
    projects both to a shared width, interleaves bidirectional
    cross-attention blocks, and emits both a scalar regression output
    and 3-way classification logits (tight / medium / weak binding).

    NOTE(review): ``nn.MultiheadAttention`` is used with its default
    ``batch_first=False`` and pooling is over ``dim=0``, so inputs are
    presumably shaped ``(seq_len, hidden)`` or ``(seq_len, batch,
    hidden)`` — confirm against the caller.
    """

    def __init__(self,
                 esm_dim=1280,      # protein embedding width (ESM-2 650M hidden size)
                 smiles_dim=768,    # peptide SMILES embedding width
                 hidden_dim=512,    # shared projection / attention width
                 n_heads=8,         # attention heads per cross-attention block
                 n_layers=3,        # number of cross-attention blocks
                 dropout=0.1):
        super().__init__()

        # Affinity-score thresholds used by get_binding_class to bucket
        # predictions into tight / medium / weak binding.
        self.tight_threshold = 7.5  # Kd/Ki/IC50 ≤ ~30nM
        self.weak_threshold = 6.0   # Kd/Ki/IC50 > 1μM

        # Project both modalities to the same hidden width, then
        # normalize so cross-attention sees comparable scales.
        self.smiles_projection = nn.Linear(smiles_dim, hidden_dim)
        self.protein_projection = nn.Linear(esm_dim, hidden_dim)
        self.protein_norm = nn.LayerNorm(hidden_dim)
        self.smiles_norm = nn.LayerNorm(hidden_dim)

        # Stack of cross-attention blocks. Each block holds one shared
        # MultiheadAttention (reused for both attention directions in
        # forward), a post-attention LayerNorm, a position-wise FFN
        # (4x expansion), and a post-FFN LayerNorm.
        self.cross_attention_layers = nn.ModuleList([
            nn.ModuleDict({
                'attention': nn.MultiheadAttention(hidden_dim, n_heads, dropout=dropout),
                'norm1': nn.LayerNorm(hidden_dim),
                'ffn': nn.Sequential(
                    nn.Linear(hidden_dim, hidden_dim * 4),
                    nn.ReLU(),
                    nn.Dropout(dropout),
                    nn.Linear(hidden_dim * 4, hidden_dim)
                ),
                'norm2': nn.LayerNorm(hidden_dim)
            }) for _ in range(n_layers)
        ])

        # Shared trunk over the concatenated (protein ‖ smiles) pooled
        # representations, feeding both prediction heads.
        self.shared_head = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
        )

        # Regression head: scalar affinity score.
        self.regression_head = nn.Linear(hidden_dim, 1)

        # Classification head (3 classes: tight, medium, loose binding).
        self.classification_head = nn.Linear(hidden_dim, 3)

    def get_binding_class(self, affinity):
        """Convert affinity values to class indices.

        0: tight binding (>= 7.5)
        1: medium binding (6.0-7.5)
        2: weak binding (< 6.0)

        Accepts either a ``torch.Tensor`` (returns a ``long`` tensor of
        the same shape) or a plain scalar (returns an ``int``).
        """
        if isinstance(affinity, torch.Tensor):
            # Vectorized bucketing: default 0 (tight), then overwrite
            # the medium and weak positions.
            tight_mask = affinity >= self.tight_threshold
            weak_mask = affinity < self.weak_threshold
            medium_mask = ~(tight_mask | weak_mask)

            classes = torch.zeros_like(affinity, dtype=torch.long)
            classes[medium_mask] = 1
            classes[weak_mask] = 2
            return classes
        else:
            if affinity >= self.tight_threshold:
                return 0  # tight binding
            elif affinity < self.weak_threshold:
                return 2  # weak binding
            else:
                return 1  # medium binding

    def forward(self, protein_emb, smiles_emb):
        """Run the cross-attention stack and both prediction heads.

        Args:
            protein_emb: protein token embeddings, last dim ``esm_dim``.
            smiles_emb: peptide SMILES token embeddings, last dim
                ``smiles_dim``.

        Returns:
            Tuple ``(regression_output, classification_logits)`` — a
            scalar affinity prediction (last dim 1) and 3-class logits.
        """
        # Project each modality into the shared hidden space.
        protein = self.protein_norm(self.protein_projection(protein_emb))
        smiles = self.smiles_norm(self.smiles_projection(smiles_emb))

        #protein = protein.transpose(0, 1)
        #smiles = smiles.transpose(0, 1)

        # Alternating bidirectional cross-attention with pre-update
        # residual connections. Note each block reuses one attention
        # module for both directions, and the SMILES→protein pass sees
        # the protein stream already updated within the same block.
        for layer in self.cross_attention_layers:
            # Protein attending to SMILES (query=protein, key/value=smiles).
            attended_protein = layer['attention'](
                protein, smiles, smiles
            )[0]
            protein = layer['norm1'](protein + attended_protein)
            protein = layer['norm2'](protein + layer['ffn'](protein))

            # SMILES attending to protein (query=smiles, key/value=protein).
            attended_smiles = layer['attention'](
                smiles, protein, protein
            )[0]
            smiles = layer['norm1'](smiles + attended_smiles)
            smiles = layer['norm2'](smiles + layer['ffn'](smiles))

        # Mean-pool over dim 0 (the sequence axis — see class NOTE) to
        # get one vector per stream.
        protein_pool = torch.mean(protein, dim=0)
        smiles_pool = torch.mean(smiles, dim=0)

        # Concatenate both pooled representations for the joint trunk.
        combined = torch.cat([protein_pool, smiles_pool], dim=-1)

        # Shared features feed both heads.
        shared_features = self.shared_head(combined)

        regression_output = self.regression_head(shared_features)
        classification_logits = self.classification_head(shared_features)

        return regression_output, classification_logits
|
| 119 |
-
|
| 120 |
-
class BindingAffinity:
    """Predict protein-peptide binding affinity with a cross-attention head.

    The target protein is embedded once with ESM-2 at construction time and
    mean-pooled over residues; each peptide SMILES is embedded with
    PeptideCLM at scoring time, and the pair is fed to
    ``ImprovedBindingPredictor`` whose regression output is the score.
    """

    def __init__(self, prot_seq, model_type='PeptideCLM'):
        # `model_type` is currently unused; kept for interface compatibility.
        # (Removed the original no-op super().__init__() -- this is a plain class.)

        # Peptide (SMILES) encoder.
        self.pep_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.pep_tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                                  f'{base_path}/functions/tokenizer/new_splits.txt')

        # Binding-affinity head.
        # NOTE(review): weights_only=False unpickles arbitrary objects --
        # only load checkpoints from a trusted source.
        self.model = ImprovedBindingPredictor()
        checkpoint = torch.load(f'{base_path}/src/binding/best_model.pt', weights_only=False)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

        # Protein encoder (ESM-2 650M). Fix: switch to eval mode so dropout
        # is disabled during the one-off target embedding below.
        self.esm_model, alphabet = esm.pretrained.esm2_t33_650M_UR50D()  # load ESM-2 model
        self.esm_model.eval()
        self.prot_tokenizer = alphabet.get_batch_converter()  # load esm tokenizer

        # Embed the target protein once and mean-pool over residues -> (1, dim).
        data = [("target", prot_seq)]
        _, _, prot_tokens = self.prot_tokenizer(data)
        with torch.no_grad():
            results = self.esm_model(prot_tokens, repr_layers=[33])
            prot_emb = results["representations"][33]

        self.prot_emb = torch.mean(prot_emb[0], dim=0, keepdim=True)

    def forward(self, input_seqs):
        """Return one predicted affinity score (float) per peptide SMILES."""
        scores = []
        with torch.no_grad():  # single no_grad scope (was redundantly nested)
            for seq in input_seqs:
                pep_tokens = self.pep_tokenizer(seq, return_tensors='pt', padding=True)
                emb = self.pep_model(input_ids=pep_tokens['input_ids'],
                                     attention_mask=pep_tokens['attention_mask'],
                                     output_hidden_states=True)
                # Mean-pool the peptide token embeddings -> (1, dim).
                pep_emb = torch.mean(emb.last_hidden_state.squeeze(0), dim=0, keepdim=True)
                score, _ = self.model(self.prot_emb, pep_emb)
                scores.append(score.item())
        return scores

    def __call__(self, input_seqs: list):
        return self.forward(input_seqs)
|
| 169 |
-
|
| 170 |
-
def unittest():
    """Smoke test: score one peptide SMILES against the transferrin receptor.

    NOTE(review): the name shadows the stdlib ``unittest`` module; kept for
    backward compatibility with the ``__main__`` guard below.
    """
    # Fix: removed five large unused target sequences (amhr, gfap, glp1,
    # glast, ncam) that were dead locals; only the transferrin receptor
    # sequence was ever used.
    tfr = 'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF'

    binding = BindingAffinity(tfr)

    seq = ["CC[C@H](C)[C@H](NC(=O)[C@H](C)NC(=O)[C@@H](N)Cc1c[nH]cn1)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N1CCC[C@H]1C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1c[nH]cn1)C(=O)O"]

    scores = binding(seq)
    print(scores)


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/hemolysis.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit.Chem import Descriptors, rdMolDescriptors
|
| 11 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 12 |
-
from rdkit.Chem import AllChem
|
| 13 |
-
from typing import List
|
| 14 |
-
|
| 15 |
-
rdBase.DisableLog('rdApp.error')
|
| 16 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 17 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 18 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 19 |
-
|
| 20 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 21 |
-
|
| 22 |
-
class Hemolysis:
    """Score peptide SMILES for hemolysis.

    An XGBoost classifier over mean-pooled PeptideCLM embeddings; scores are
    returned as P(not hemolytic) = 1 - P(hemolytic), so higher is safer.
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/best_model_f1.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        pooled = []
        for seq in sequences:
            batch = self.tokenizer(seq, return_tensors='pt')
            with torch.no_grad():
                out = self.emb_model(**batch)
            # Mean pooling across the sequence-length dimension.
            pooled.append(out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(pooled)

    def get_scores(self, input_seqs: list):
        """P(not hemolytic) for each input; all-ones fallback if no features."""
        default = np.ones(len(input_seqs))
        features = self.generate_embeddings(input_seqs)

        if len(features) == 0:
            return default

        # Sanitize before handing to XGBoost.
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        hemolytic_prob = self.predictor.predict(xgb.DMatrix(features))
        # Return the probability of it being not hemolytic.
        return default - hemolytic_prob

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 59 |
-
|
| 60 |
-
def unittest():
    """Smoke test: run the hemolysis scorer on a single peptide SMILES."""
    scorer = Hemolysis()
    smiles = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/nonfouling.py
DELETED
|
@@ -1,69 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 11 |
-
from transformers import AutoModelForMaskedLM
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
rdBase.DisableLog('rdApp.error')
|
| 15 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 16 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 17 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 18 |
-
|
| 19 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 20 |
-
|
| 21 |
-
class Nonfouling:
    """Score peptide SMILES for nonfouling behaviour.

    An XGBoost classifier over mean-pooled PeptideCLM embeddings; returns
    the predicted probability that each peptide is nonfouling.
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/nonfouling/best_model_f1.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        pooled = []
        for seq in sequences:
            batch = self.tokenizer(seq, return_tensors='pt')
            with torch.no_grad():
                out = self.emb_model(**batch)
            # Mean pooling across the sequence-length dimension.
            pooled.append(out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(pooled)

    def get_scores(self, input_seqs: list):
        """Nonfouling probability for each input; zeros fallback if no features."""
        fallback = np.zeros(len(input_seqs))
        features = self.generate_embeddings(input_seqs)

        if len(features) == 0:
            return fallback

        # Sanitize before handing to XGBoost.
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        # Probability of the peptide being nonfouling (the original comment
        # said "not hemolytic" -- a copy-paste from hemolysis.py).
        return self.predictor.predict(xgb.DMatrix(features))

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 59 |
-
|
| 60 |
-
def unittest():
    """Smoke test: run the nonfouling scorer on a single peptide SMILES."""
    scorer = Nonfouling()
    smiles = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/permeability.py
DELETED
|
@@ -1,167 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit.Chem import Descriptors, rdMolDescriptors
|
| 11 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 12 |
-
from rdkit.Chem import AllChem
|
| 13 |
-
from typing import List
|
| 14 |
-
|
| 15 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 16 |
-
|
| 17 |
-
rdBase.DisableLog('rdApp.error')
|
| 18 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 19 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 20 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 21 |
-
|
| 22 |
-
def fingerprints_from_smiles(smiles: List, size=2048):
    """Create ECFP fingerprints of smiles, with validity check.

    Args:
        smiles: list of SMILES strings.
        size: fingerprint bit-vector length.

    Returns:
        ``(fps, valid_mask)`` -- ``fps`` is a (len(smiles), size) array with
        an all-zero row for each unparseable SMILES, and ``valid_mask[i]``
        is 1 iff ``smiles[i]`` parsed.
    """
    if not smiles:
        # Fix: np.concatenate raises ValueError on an empty list; return an
        # empty block with the right width instead of crashing.
        return np.zeros((0, size)), []

    fps = []
    valid_mask = []
    for smile in smiles:
        mol = Chem.MolFromSmiles(smile)
        valid_mask.append(int(mol is not None))
        fp = fingerprints_from_mol(mol, size=size) if mol else np.zeros((1, size))
        fps.append(fp)

    return np.concatenate(fps, axis=0), valid_mask
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
def fingerprints_from_mol(molecule, radius=3, size=2048, hashed=False):
    """Create an ECFP fingerprint of a molecule as a (1, size) numpy array.

    ``hashed`` selects the hashed count fingerprint instead of the bit
    vector; both are converted to a dense numpy row.
    """
    maker = AllChem.GetHashedMorganFingerprint if hashed else AllChem.GetMorganFingerprintAsBitVect
    fp_bits = maker(molecule, radius, nBits=size)
    dense = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp_bits, dense)
    return dense.reshape(1, -1)
|
| 45 |
-
|
| 46 |
-
def getMolDescriptors(mol, missingVal=0):
    """Calculate the full list of RDKit descriptors for a molecule.

    Args:
        mol: an RDKit molecule (or None / invalid -- every descriptor then
            falls back to ``missingVal``).
        missingVal: value recorded when a descriptor function raises.

    Returns:
        ``(values, names)`` in a fixed order: all of ``Descriptors._descList``
        followed by three custom Lipinski/rotatable-bond descriptors.
    """
    custom_descriptors = {'hydrogen-bond donors': rdMolDescriptors.CalcNumLipinskiHBD,
                          'hydrogen-bond acceptors': rdMolDescriptors.CalcNumLipinskiHBA,
                          'rotatable bonds': rdMolDescriptors.CalcNumRotatableBonds,}

    values, names = [], []
    # Single loop over both descriptor sources (the original duplicated the
    # try/append body); order is preserved: _descList first, then custom.
    for nm, fn in list(Descriptors._descList) + list(custom_descriptors.items()):
        try:
            val = fn(mol)
        except Exception:
            # Fix: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit.
            val = missingVal
        values.append(val)
        names.append(nm)
    return values, names
|
| 70 |
-
|
| 71 |
-
def get_pep_dps_from_smi(smi):
    """Compute the descriptor vector for one SMILES string.

    Unparseable input is reported on stdout and yields a vector of missing
    values (via ``getMolDescriptors(None)``) rather than raising.
    """
    try:
        mol = Chem.MolFromSmiles(smi)
    except Exception:
        # Fix: was a bare `except:` (also caught KeyboardInterrupt).
        print(f"convert smi {smi} to molecule failed!")
        mol = None

    dps, _ = getMolDescriptors(mol)
    return np.array(dps)
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def get_pep_dps(smi_list):
    """Stack per-SMILES descriptor vectors into a 2-D array.

    An empty input returns a (0, 213) array so downstream concatenation
    keeps a stable width.
    """
    if not smi_list:
        # 213: presumably len(Descriptors._descList) + 3 custom descriptors
        # -- TODO confirm, this varies with the RDKit version.
        return np.zeros((0, 213))
    rows = [get_pep_dps_from_smi(smi) for smi in smi_list]
    return np.array(rows)
|
| 86 |
-
|
| 87 |
-
def check_smi_validity(smiles: list):
    """Filter a SMILES list down to the entries RDKit can parse.

    Returns ``(valid_smi, valid_idx)`` -- the parseable strings and their
    original indices. Empty/falsy entries and parse failures are dropped
    silently (best-effort by design).
    """
    valid_smi = []
    valid_idx = []
    for idx, smi in enumerate(smiles):
        try:
            mol = Chem.MolFromSmiles(smi) if smi else None
        except Exception:
            continue  # unparseable entry: skip without logging
        if mol:
            valid_smi.append(smi)
            valid_idx.append(idx)
    return valid_smi, valid_idx
|
| 99 |
-
|
| 100 |
-
class Permeability:
    """Score peptide SMILES for membrane permeability.

    An XGBoost regressor over PeptideCLM embeddings, optionally augmented
    with ECFP fingerprints and RDKit descriptors (both off by default).
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/permeability/best_model.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        embeddings = []
        for sequence in sequences:
            tokenized = self.tokenizer(sequence, return_tensors='pt')
            with torch.no_grad():
                output = self.emb_model(**tokenized)
            # Mean pooling across sequence length
            embeddings.append(output.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(embeddings)

    def get_features(self, input_seqs: list, dps=False, fps=False):
        """Build the feature matrix: [fingerprints | descriptors | embeddings].

        Disabled feature groups contribute an (n, 0) block so the
        concatenation stays shape-stable.
        """
        if fps:
            fingerprints = fingerprints_from_smiles(input_seqs)[0]
        else:
            # Fix: was torch.empty -- keep everything numpy so np.concatenate
            # does not silently convert torch tensors.
            fingerprints = np.empty((len(input_seqs), 0))

        if dps:
            descriptors = get_pep_dps(input_seqs)
        else:
            descriptors = np.empty((len(input_seqs), 0))

        embeddings = self.generate_embeddings(input_seqs)

        return np.concatenate([fingerprints, descriptors, embeddings], axis=1)

    def get_scores(self, input_seqs: list):
        """Predicted permeability per input; -10 sentinel for empty input."""
        scores = -10 * np.ones(len(input_seqs))
        # Fix: an empty input list used to crash in get_features
        # (1-D empty embeddings vs 2-D placeholder blocks); short-circuit.
        if not input_seqs:
            return scores

        features = self.get_features(input_seqs)
        if len(features) == 0:
            return scores

        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        return self.predictor.predict(xgb.DMatrix(features))

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 158 |
-
|
| 159 |
-
def unittest():
    """Smoke test: score one cyclic peptide SMILES for permeability."""
    scorer = Permeability()
    smiles = ['N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1cNc2c1cc(O)cc2)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](Cc1ccccc1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H]([C@@H](O)C(C)C)C(=O)N[C@@H](Cc1ccc(O)cc1)C(=O)N[C@H](CC(=CN2)C1=C2C=CC=C1)C(=O)O']
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/solubility.py
DELETED
|
@@ -1,68 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import os
|
| 3 |
-
import xgboost as xgb
|
| 4 |
-
import torch
|
| 5 |
-
import numpy as np
|
| 6 |
-
from transformers import AutoModelForMaskedLM
|
| 7 |
-
from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
|
| 8 |
-
import warnings
|
| 9 |
-
import numpy as np
|
| 10 |
-
from rdkit.Chem import Descriptors, rdMolDescriptors
|
| 11 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 12 |
-
from rdkit.Chem import AllChem
|
| 13 |
-
from typing import List
|
| 14 |
-
from transformers import AutoModelForMaskedLM
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
rdBase.DisableLog('rdApp.error')
|
| 18 |
-
warnings.filterwarnings("ignore", category=DeprecationWarning)
|
| 19 |
-
warnings.filterwarnings("ignore", category=UserWarning)
|
| 20 |
-
warnings.filterwarnings("ignore", category=FutureWarning)
|
| 21 |
-
|
| 22 |
-
base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"
|
| 23 |
-
|
| 24 |
-
class Solubility:
    """Score peptide SMILES for aqueous solubility.

    An XGBoost classifier over mean-pooled PeptideCLM embeddings; returns
    the predicted solubility probability for each input.
    """

    def __init__(self):
        self.predictor = xgb.Booster(model_file=f'{base_path}/src/solubility/best_model_f1.json')
        self.emb_model = AutoModelForMaskedLM.from_pretrained('aaronfeller/PeptideCLM-23M-all').roformer
        self.tokenizer = SMILES_SPE_Tokenizer(f'{base_path}/functions/tokenizer/new_vocab.txt',
                                              f'{base_path}/functions/tokenizer/new_splits.txt')

    def generate_embeddings(self, sequences):
        """Return one mean-pooled PeptideCLM embedding row per sequence."""
        pooled = []
        for seq in sequences:
            batch = self.tokenizer(seq, return_tensors='pt')
            with torch.no_grad():
                out = self.emb_model(**batch)
            # Mean pooling across the sequence-length dimension.
            pooled.append(out.last_hidden_state.mean(dim=1).squeeze(0).cpu().numpy())
        return np.array(pooled)

    def get_scores(self, input_seqs: list):
        """Solubility probability per input; zeros fallback if no features."""
        fallback = np.zeros(len(input_seqs))
        features = self.generate_embeddings(input_seqs)

        if len(features) == 0:
            return fallback

        # Sanitize before handing to XGBoost.
        features = np.nan_to_num(features, nan=0.)
        features = np.clip(features, np.finfo(np.float32).min, np.finfo(np.float32).max)

        return self.predictor.predict(xgb.DMatrix(features))

    def __call__(self, input_seqs: list):
        return self.get_scores(input_seqs)
|
| 60 |
-
|
| 61 |
-
def unittest():
    """Smoke test: run the solubility scorer on a single peptide SMILES."""
    scorer = Solubility()
    smiles = ["NCC(=O)N[C@H](CS)C(=O)N[C@@H](CO)C(=O)NCC(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CC(=O)N)C(=O)N[C@@H](CC(=CN2)C1=C2C=CC=C1)C(=O)N[C@@H](c1ccc(cc1)F)C(=O)N[C@@H]([C@H](CC)C)C(=O)N[C@@H](CCCO)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CO)C(=O)O"]
    print(scorer(input_seqs=smiles))


if __name__ == '__main__':
    unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/tokenizer/__pycache__/my_tokenizers.cpython-310.pyc
DELETED
|
Binary file (15.5 kB)
|
|
|
functions/tokenizer/my_tokenizers.py
DELETED
|
@@ -1,398 +0,0 @@
|
|
| 1 |
-
import collections
|
| 2 |
-
import logging
|
| 3 |
-
import os
|
| 4 |
-
import re
|
| 5 |
-
import codecs
|
| 6 |
-
import unicodedata
|
| 7 |
-
from typing import List, Optional
|
| 8 |
-
from transformers import PreTrainedTokenizer
|
| 9 |
-
from SmilesPE.tokenizer import SPE_Tokenizer
|
| 10 |
-
|
| 11 |
-
def load_vocab(vocab_file):
    """Load a vocabulary file into an ordered token -> index mapping.

    Each line of *vocab_file* is one token; its (0-based) line number
    becomes its id. Insertion order follows file order.
    """
    vocab = collections.OrderedDict()
    with open(vocab_file, "r", encoding="utf-8") as reader:
        for index, line in enumerate(reader):
            vocab[line.rstrip("\n")] = index
    return vocab
|
| 20 |
-
|
| 21 |
-
class Atomwise_Tokenizer(object):
    """Run atom-level SMILES tokenization."""

    def __init__(self):
        """Construct an atom-level tokenizer from a fixed regex.

        Alternatives are tried left to right: short parenthesised groups
        (up to 4 inner characters), bracket atoms, two-letter halogens,
        then single-character organic atoms, bonds, and ring digits.
        """
        self.regex_pattern = r"(\([^\(\)]{0,4}\)|\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/\/?|:|~|@|\?|>>?|\*|\$|\%[0-9]{2}|[0-9])"
        self.regex = re.compile(self.regex_pattern)

    def tokenize(self, text):
        """Split a SMILES string into a list of atom-level tokens."""
        return list(self.regex.findall(text))
|
| 37 |
-
|
| 38 |
-
class SMILES_SPE_Tokenizer(PreTrainedTokenizer):
|
| 39 |
-
r"""
|
| 40 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 41 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 42 |
-
should refer to the superclass for more information regarding methods.
|
| 43 |
-
Args:
|
| 44 |
-
vocab_file (:obj:`string`):
|
| 45 |
-
File containing the vocabulary.
|
| 46 |
-
spe_file (:obj:`string`):
|
| 47 |
-
File containing the trained SMILES Pair Encoding vocabulary.
|
| 48 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 49 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 50 |
-
token instead.
|
| 51 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 52 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 53 |
-
for sequence classification or for a text and a question for question answering.
|
| 54 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 55 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 56 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 57 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 58 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 59 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 60 |
-
special tokens.
|
| 61 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 62 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 63 |
-
modeling. This is the token which the model will try to predict.
|
| 64 |
-
"""
|
| 65 |
-
|
| 66 |
-
def __init__(self, vocab_file, spe_file,
             unk_token="[UNK]",
             sep_token="[SEP]",
             pad_token="[PAD]",
             cls_token="[CLS]",
             mask_token="[MASK]",
             **kwargs):
    """Build the tokenizer from a plain vocab file and an SPE merges file.

    Raises:
        ValueError: if either file does not exist.
    """
    if not os.path.isfile(vocab_file):
        raise ValueError("Can't find a vocabulary file at path '{}'.".format(vocab_file))
    if not os.path.isfile(spe_file):
        raise ValueError("Can't find a SPE vocabulary file at path '{}'.".format(spe_file))

    self.vocab = load_vocab(vocab_file)
    self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
    # Fix: the original kept the spe_file handle open for the object's
    # lifetime (resource leak). SPE_Tokenizer consumes the file in its
    # constructor, so close it once construction is done.
    # NOTE(review): self.spe_vocab is now a *closed* handle, kept only for
    # attribute compatibility -- confirm no caller reads from it afterwards.
    with open(spe_file, 'r', encoding='utf-8') as spe_vocab:
        self.spe_vocab = spe_vocab
        self.spe_tokenizer = SPE_Tokenizer(spe_vocab)

    super().__init__(
        unk_token=unk_token,
        sep_token=sep_token,
        pad_token=pad_token,
        cls_token=cls_token,
        mask_token=mask_token,
        **kwargs)
|
| 90 |
-
|
| 91 |
-
@property
def vocab_size(self):
    """Size of the base vocabulary (added tokens excluded)."""
    return len(self.vocab)

def get_vocab(self):
    """Full token -> id mapping, including tokens added after init."""
    merged = dict(self.vocab, **self.added_tokens_encoder)
    return merged

def _tokenize(self, text):
    """Split a SMILES string into SPE sub-tokens."""
    spe_string = self.spe_tokenizer.tokenize(text)
    return spe_string.split(' ')

def _convert_token_to_id(self, token):
    """Map a token (str) to its id; unknown tokens map to the [UNK] id."""
    unk_id = self.vocab.get(self.unk_token)
    return self.vocab.get(token, unk_id)

def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=True):
    """Turn a list of ids back into a SMILES-like string."""
    tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
    return self.convert_tokens_to_string(tokens)

def _convert_id_to_token(self, index):
    """Map an id (int) back to its token (str); unknown ids give [UNK]."""
    return self.ids_to_tokens.get(index, self.unk_token)

def convert_tokens_to_string(self, tokens):
    """Join tokens with spaces and fuse WordPiece-style '##' continuations."""
    joined = " ".join(tokens)
    return joined.replace(" ##", "").strip()
|
| 117 |
-
|
| 118 |
-
def build_inputs_with_special_tokens(
|
| 119 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 120 |
-
) -> List[int]:
|
| 121 |
-
"""
|
| 122 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 123 |
-
by concatenating and adding special tokens.
|
| 124 |
-
A BERT sequence has the following format:
|
| 125 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 126 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 127 |
-
Args:
|
| 128 |
-
token_ids_0 (:obj:`List[int]`):
|
| 129 |
-
List of IDs to which the special tokens will be added
|
| 130 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 131 |
-
Optional second list of IDs for sequence pairs.
|
| 132 |
-
Returns:
|
| 133 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 134 |
-
"""
|
| 135 |
-
if token_ids_1 is None:
|
| 136 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 137 |
-
cls = [self.cls_token_id]
|
| 138 |
-
sep = [self.sep_token_id]
|
| 139 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 140 |
-
|
| 141 |
-
def get_special_tokens_mask(
|
| 142 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 143 |
-
) -> List[int]:
|
| 144 |
-
"""
|
| 145 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 146 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 147 |
-
Args:
|
| 148 |
-
token_ids_0 (:obj:`List[int]`):
|
| 149 |
-
List of ids.
|
| 150 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 151 |
-
Optional second list of IDs for sequence pairs.
|
| 152 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 153 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 154 |
-
Returns:
|
| 155 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 156 |
-
"""
|
| 157 |
-
|
| 158 |
-
if already_has_special_tokens:
|
| 159 |
-
if token_ids_1 is not None:
|
| 160 |
-
raise ValueError(
|
| 161 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 162 |
-
"ids is already formated with special tokens for the model."
|
| 163 |
-
)
|
| 164 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 165 |
-
|
| 166 |
-
if token_ids_1 is not None:
|
| 167 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 168 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 169 |
-
|
| 170 |
-
def create_token_type_ids_from_sequences(
|
| 171 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 172 |
-
) -> List[int]:
|
| 173 |
-
"""
|
| 174 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 175 |
-
A BERT sequence pair mask has the following format:
|
| 176 |
-
::
|
| 177 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 178 |
-
| first sequence | second sequence |
|
| 179 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 180 |
-
Args:
|
| 181 |
-
token_ids_0 (:obj:`List[int]`):
|
| 182 |
-
List of ids.
|
| 183 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 184 |
-
Optional second list of IDs for sequence pairs.
|
| 185 |
-
Returns:
|
| 186 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 187 |
-
sequence(s).
|
| 188 |
-
"""
|
| 189 |
-
sep = [self.sep_token_id]
|
| 190 |
-
cls = [self.cls_token_id]
|
| 191 |
-
if token_ids_1 is None:
|
| 192 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 193 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 194 |
-
|
| 195 |
-
def save_vocabulary(self, vocab_path):
|
| 196 |
-
"""
|
| 197 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 198 |
-
Args:
|
| 199 |
-
vocab_path (:obj:`str`):
|
| 200 |
-
The directory in which to save the vocabulary.
|
| 201 |
-
Returns:
|
| 202 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 203 |
-
"""
|
| 204 |
-
index = 0
|
| 205 |
-
if os.path.isdir(vocab_path):
|
| 206 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 207 |
-
else:
|
| 208 |
-
vocab_file = vocab_path
|
| 209 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 210 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 211 |
-
if index != token_index:
|
| 212 |
-
logger.warning(
|
| 213 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 214 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 215 |
-
)
|
| 216 |
-
index = token_index
|
| 217 |
-
writer.write(token + "\n")
|
| 218 |
-
index += 1
|
| 219 |
-
return (vocab_file,)
|
| 220 |
-
|
| 221 |
-
class SMILES_Atomwise_Tokenizer(PreTrainedTokenizer):
|
| 222 |
-
r"""
|
| 223 |
-
Constructs a SMILES tokenizer. Based on SMILES Pair Encoding (https://github.com/XinhaoLi74/SmilesPE).
|
| 224 |
-
This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
|
| 225 |
-
should refer to the superclass for more information regarding methods.
|
| 226 |
-
Args:
|
| 227 |
-
vocab_file (:obj:`string`):
|
| 228 |
-
File containing the vocabulary.
|
| 229 |
-
unk_token (:obj:`string`, `optional`, defaults to "[UNK]"):
|
| 230 |
-
The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
|
| 231 |
-
token instead.
|
| 232 |
-
sep_token (:obj:`string`, `optional`, defaults to "[SEP]"):
|
| 233 |
-
The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
|
| 234 |
-
for sequence classification or for a text and a question for question answering.
|
| 235 |
-
It is also used as the last token of a sequence built with special tokens.
|
| 236 |
-
pad_token (:obj:`string`, `optional`, defaults to "[PAD]"):
|
| 237 |
-
The token used for padding, for example when batching sequences of different lengths.
|
| 238 |
-
cls_token (:obj:`string`, `optional`, defaults to "[CLS]"):
|
| 239 |
-
The classifier token which is used when doing sequence classification (classification of the whole
|
| 240 |
-
sequence instead of per-token classification). It is the first token of the sequence when built with
|
| 241 |
-
special tokens.
|
| 242 |
-
mask_token (:obj:`string`, `optional`, defaults to "[MASK]"):
|
| 243 |
-
The token used for masking values. This is the token used when training this model with masked language
|
| 244 |
-
modeling. This is the token which the model will try to predict.
|
| 245 |
-
"""
|
| 246 |
-
|
| 247 |
-
def __init__(
|
| 248 |
-
self,
|
| 249 |
-
vocab_file,
|
| 250 |
-
unk_token="[UNK]",
|
| 251 |
-
sep_token="[SEP]",
|
| 252 |
-
pad_token="[PAD]",
|
| 253 |
-
cls_token="[CLS]",
|
| 254 |
-
mask_token="[MASK]",
|
| 255 |
-
**kwargs
|
| 256 |
-
):
|
| 257 |
-
super().__init__(
|
| 258 |
-
unk_token=unk_token,
|
| 259 |
-
sep_token=sep_token,
|
| 260 |
-
pad_token=pad_token,
|
| 261 |
-
cls_token=cls_token,
|
| 262 |
-
mask_token=mask_token,
|
| 263 |
-
**kwargs,
|
| 264 |
-
)
|
| 265 |
-
|
| 266 |
-
if not os.path.isfile(vocab_file):
|
| 267 |
-
raise ValueError(
|
| 268 |
-
"Can't find a vocabulary file at path '{}'.".format(vocab_file)
|
| 269 |
-
)
|
| 270 |
-
self.vocab = load_vocab(vocab_file)
|
| 271 |
-
self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
|
| 272 |
-
self.tokenizer = Atomwise_Tokenizer()
|
| 273 |
-
|
| 274 |
-
@property
|
| 275 |
-
def vocab_size(self):
|
| 276 |
-
return len(self.vocab)
|
| 277 |
-
|
| 278 |
-
def get_vocab(self):
|
| 279 |
-
return dict(self.vocab, **self.added_tokens_encoder)
|
| 280 |
-
|
| 281 |
-
def _tokenize(self, text):
|
| 282 |
-
return self.tokenizer.tokenize(text)
|
| 283 |
-
|
| 284 |
-
def _convert_token_to_id(self, token):
|
| 285 |
-
""" Converts a token (str) in an id using the vocab. """
|
| 286 |
-
return self.vocab.get(token, self.vocab.get(self.unk_token))
|
| 287 |
-
|
| 288 |
-
def _convert_id_to_token(self, index):
|
| 289 |
-
"""Converts an index (integer) in a token (str) using the vocab."""
|
| 290 |
-
return self.ids_to_tokens.get(index, self.unk_token)
|
| 291 |
-
|
| 292 |
-
def convert_tokens_to_string(self, tokens):
|
| 293 |
-
""" Converts a sequence of tokens (string) in a single string. """
|
| 294 |
-
out_string = " ".join(tokens).replace(" ##", "").strip()
|
| 295 |
-
return out_string
|
| 296 |
-
|
| 297 |
-
def build_inputs_with_special_tokens(
|
| 298 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 299 |
-
) -> List[int]:
|
| 300 |
-
"""
|
| 301 |
-
Build model inputs from a sequence or a pair of sequence for sequence classification tasks
|
| 302 |
-
by concatenating and adding special tokens.
|
| 303 |
-
A BERT sequence has the following format:
|
| 304 |
-
- single sequence: ``[CLS] X [SEP]``
|
| 305 |
-
- pair of sequences: ``[CLS] A [SEP] B [SEP]``
|
| 306 |
-
Args:
|
| 307 |
-
token_ids_0 (:obj:`List[int]`):
|
| 308 |
-
List of IDs to which the special tokens will be added
|
| 309 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 310 |
-
Optional second list of IDs for sequence pairs.
|
| 311 |
-
Returns:
|
| 312 |
-
:obj:`List[int]`: list of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
|
| 313 |
-
"""
|
| 314 |
-
if token_ids_1 is None:
|
| 315 |
-
return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
|
| 316 |
-
cls = [self.cls_token_id]
|
| 317 |
-
sep = [self.sep_token_id]
|
| 318 |
-
return cls + token_ids_0 + sep + token_ids_1 + sep
|
| 319 |
-
|
| 320 |
-
def get_special_tokens_mask(
|
| 321 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
|
| 322 |
-
) -> List[int]:
|
| 323 |
-
"""
|
| 324 |
-
Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
|
| 325 |
-
special tokens using the tokenizer ``prepare_for_model`` method.
|
| 326 |
-
Args:
|
| 327 |
-
token_ids_0 (:obj:`List[int]`):
|
| 328 |
-
List of ids.
|
| 329 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 330 |
-
Optional second list of IDs for sequence pairs.
|
| 331 |
-
already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
|
| 332 |
-
Set to True if the token list is already formatted with special tokens for the model
|
| 333 |
-
Returns:
|
| 334 |
-
:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
|
| 335 |
-
"""
|
| 336 |
-
|
| 337 |
-
if already_has_special_tokens:
|
| 338 |
-
if token_ids_1 is not None:
|
| 339 |
-
raise ValueError(
|
| 340 |
-
"You should not supply a second sequence if the provided sequence of "
|
| 341 |
-
"ids is already formated with special tokens for the model."
|
| 342 |
-
)
|
| 343 |
-
return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))
|
| 344 |
-
|
| 345 |
-
if token_ids_1 is not None:
|
| 346 |
-
return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
|
| 347 |
-
return [1] + ([0] * len(token_ids_0)) + [1]
|
| 348 |
-
|
| 349 |
-
def create_token_type_ids_from_sequences(
|
| 350 |
-
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
|
| 351 |
-
) -> List[int]:
|
| 352 |
-
"""
|
| 353 |
-
Creates a mask from the two sequences passed to be used in a sequence-pair classification task.
|
| 354 |
-
A BERT sequence pair mask has the following format:
|
| 355 |
-
::
|
| 356 |
-
0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1
|
| 357 |
-
| first sequence | second sequence |
|
| 358 |
-
if token_ids_1 is None, only returns the first portion of the mask (0's).
|
| 359 |
-
Args:
|
| 360 |
-
token_ids_0 (:obj:`List[int]`):
|
| 361 |
-
List of ids.
|
| 362 |
-
token_ids_1 (:obj:`List[int]`, `optional`, defaults to :obj:`None`):
|
| 363 |
-
Optional second list of IDs for sequence pairs.
|
| 364 |
-
Returns:
|
| 365 |
-
:obj:`List[int]`: List of `token type IDs <../glossary.html#token-type-ids>`_ according to the given
|
| 366 |
-
sequence(s).
|
| 367 |
-
"""
|
| 368 |
-
sep = [self.sep_token_id]
|
| 369 |
-
cls = [self.cls_token_id]
|
| 370 |
-
if token_ids_1 is None:
|
| 371 |
-
return len(cls + token_ids_0 + sep) * [0]
|
| 372 |
-
return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1]
|
| 373 |
-
|
| 374 |
-
def save_vocabulary(self, vocab_path):
|
| 375 |
-
"""
|
| 376 |
-
Save the sentencepiece vocabulary (copy original file) and special tokens file to a directory.
|
| 377 |
-
Args:
|
| 378 |
-
vocab_path (:obj:`str`):
|
| 379 |
-
The directory in which to save the vocabulary.
|
| 380 |
-
Returns:
|
| 381 |
-
:obj:`Tuple(str)`: Paths to the files saved.
|
| 382 |
-
"""
|
| 383 |
-
index = 0
|
| 384 |
-
if os.path.isdir(vocab_path):
|
| 385 |
-
vocab_file = os.path.join(vocab_path, VOCAB_FILES_NAMES["vocab_file"])
|
| 386 |
-
else:
|
| 387 |
-
vocab_file = vocab_path
|
| 388 |
-
with open(vocab_file, "w", encoding="utf-8") as writer:
|
| 389 |
-
for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
|
| 390 |
-
if index != token_index:
|
| 391 |
-
logger.warning(
|
| 392 |
-
"Saving vocabulary to {}: vocabulary indices are not consecutive."
|
| 393 |
-
" Please check that the vocabulary is not corrupted!".format(vocab_file)
|
| 394 |
-
)
|
| 395 |
-
index = token_index
|
| 396 |
-
writer.write(token + "\n")
|
| 397 |
-
index += 1
|
| 398 |
-
return (vocab_file,)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/tokenizer/new_splits.txt
DELETED
|
@@ -1,159 +0,0 @@
|
|
| 1 |
-
c 1
|
| 2 |
-
c 2
|
| 3 |
-
c 3
|
| 4 |
-
c 4
|
| 5 |
-
c 5
|
| 6 |
-
c 6
|
| 7 |
-
c 7
|
| 8 |
-
c 8
|
| 9 |
-
c 9
|
| 10 |
-
( c1
|
| 11 |
-
( c2
|
| 12 |
-
c1 )
|
| 13 |
-
c2 )
|
| 14 |
-
n 1
|
| 15 |
-
n 2
|
| 16 |
-
n 3
|
| 17 |
-
n 4
|
| 18 |
-
n 5
|
| 19 |
-
n 6
|
| 20 |
-
n 7
|
| 21 |
-
n 8
|
| 22 |
-
n 9
|
| 23 |
-
( n1
|
| 24 |
-
( n2
|
| 25 |
-
n1 )
|
| 26 |
-
n2 )
|
| 27 |
-
O 1
|
| 28 |
-
O 2
|
| 29 |
-
O 3
|
| 30 |
-
O 4
|
| 31 |
-
O 5
|
| 32 |
-
O 6
|
| 33 |
-
O 7
|
| 34 |
-
O 8
|
| 35 |
-
O 9
|
| 36 |
-
( O1
|
| 37 |
-
( O2
|
| 38 |
-
O2 )
|
| 39 |
-
O2 )
|
| 40 |
-
= O
|
| 41 |
-
= C
|
| 42 |
-
= c
|
| 43 |
-
= N
|
| 44 |
-
= n
|
| 45 |
-
=C C
|
| 46 |
-
=C N
|
| 47 |
-
=C c
|
| 48 |
-
=c c
|
| 49 |
-
=N C
|
| 50 |
-
=N c
|
| 51 |
-
=n C
|
| 52 |
-
=n c
|
| 53 |
-
# N
|
| 54 |
-
# C
|
| 55 |
-
#N C
|
| 56 |
-
#C C
|
| 57 |
-
#C N
|
| 58 |
-
#N N
|
| 59 |
-
( C
|
| 60 |
-
C )
|
| 61 |
-
( O
|
| 62 |
-
O )
|
| 63 |
-
( N
|
| 64 |
-
N )
|
| 65 |
-
Br c
|
| 66 |
-
( =O
|
| 67 |
-
(=O )
|
| 68 |
-
C (=O)
|
| 69 |
-
C =O
|
| 70 |
-
C =N
|
| 71 |
-
C #N
|
| 72 |
-
C #C
|
| 73 |
-
C C
|
| 74 |
-
CC C
|
| 75 |
-
CC N
|
| 76 |
-
CC O
|
| 77 |
-
CC S
|
| 78 |
-
CC c
|
| 79 |
-
CC n
|
| 80 |
-
C N
|
| 81 |
-
CN C
|
| 82 |
-
CN c
|
| 83 |
-
C O
|
| 84 |
-
CO C
|
| 85 |
-
CO N
|
| 86 |
-
CO c
|
| 87 |
-
C S
|
| 88 |
-
CS C
|
| 89 |
-
CS S
|
| 90 |
-
CS c
|
| 91 |
-
C c
|
| 92 |
-
Cl c
|
| 93 |
-
C n
|
| 94 |
-
F c
|
| 95 |
-
N C
|
| 96 |
-
NC C
|
| 97 |
-
NC c
|
| 98 |
-
N N
|
| 99 |
-
N O
|
| 100 |
-
N c
|
| 101 |
-
N n
|
| 102 |
-
O C
|
| 103 |
-
OC C
|
| 104 |
-
OC O
|
| 105 |
-
OC c
|
| 106 |
-
O N
|
| 107 |
-
O O
|
| 108 |
-
O c
|
| 109 |
-
S C
|
| 110 |
-
SC C
|
| 111 |
-
SC c
|
| 112 |
-
S S
|
| 113 |
-
S c
|
| 114 |
-
c c
|
| 115 |
-
cc c
|
| 116 |
-
cc n
|
| 117 |
-
cc o
|
| 118 |
-
cc s
|
| 119 |
-
cc cc
|
| 120 |
-
c n
|
| 121 |
-
cn c
|
| 122 |
-
cn n
|
| 123 |
-
c o
|
| 124 |
-
co c
|
| 125 |
-
c s
|
| 126 |
-
cs c
|
| 127 |
-
cs n
|
| 128 |
-
n c
|
| 129 |
-
nc c
|
| 130 |
-
nc n
|
| 131 |
-
nc o
|
| 132 |
-
nc s
|
| 133 |
-
n n
|
| 134 |
-
nn c
|
| 135 |
-
nn n
|
| 136 |
-
n o
|
| 137 |
-
no c
|
| 138 |
-
no n
|
| 139 |
-
n s
|
| 140 |
-
ns c
|
| 141 |
-
ns n
|
| 142 |
-
o c
|
| 143 |
-
oc c
|
| 144 |
-
o n
|
| 145 |
-
s c
|
| 146 |
-
sc c
|
| 147 |
-
sc n
|
| 148 |
-
s n
|
| 149 |
-
N P
|
| 150 |
-
P N
|
| 151 |
-
C P
|
| 152 |
-
P C
|
| 153 |
-
N S
|
| 154 |
-
S N
|
| 155 |
-
C S
|
| 156 |
-
S C
|
| 157 |
-
S P
|
| 158 |
-
P S
|
| 159 |
-
C I
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
functions/tokenizer/new_vocab.txt
DELETED
|
@@ -1,586 +0,0 @@
|
|
| 1 |
-
[PAD]
|
| 2 |
-
[UNK]
|
| 3 |
-
[CLS]
|
| 4 |
-
[SEP]
|
| 5 |
-
[MASK]
|
| 6 |
-
#
|
| 7 |
-
%
|
| 8 |
-
(
|
| 9 |
-
)
|
| 10 |
-
+
|
| 11 |
-
-
|
| 12 |
-
/
|
| 13 |
-
0
|
| 14 |
-
1
|
| 15 |
-
2
|
| 16 |
-
3
|
| 17 |
-
4
|
| 18 |
-
5
|
| 19 |
-
6
|
| 20 |
-
7
|
| 21 |
-
8
|
| 22 |
-
9
|
| 23 |
-
=
|
| 24 |
-
@
|
| 25 |
-
A
|
| 26 |
-
B
|
| 27 |
-
Br
|
| 28 |
-
Brc
|
| 29 |
-
C
|
| 30 |
-
CC
|
| 31 |
-
CCC
|
| 32 |
-
CCN
|
| 33 |
-
CCO
|
| 34 |
-
CCS
|
| 35 |
-
CCc
|
| 36 |
-
CCn
|
| 37 |
-
CN
|
| 38 |
-
CNC
|
| 39 |
-
CNc
|
| 40 |
-
CO
|
| 41 |
-
COC
|
| 42 |
-
CON
|
| 43 |
-
COc
|
| 44 |
-
CS
|
| 45 |
-
CSC
|
| 46 |
-
CSS
|
| 47 |
-
CSc
|
| 48 |
-
Cc
|
| 49 |
-
Cl
|
| 50 |
-
Clc
|
| 51 |
-
Cn
|
| 52 |
-
F
|
| 53 |
-
Fc
|
| 54 |
-
H
|
| 55 |
-
I
|
| 56 |
-
K
|
| 57 |
-
L
|
| 58 |
-
M
|
| 59 |
-
N
|
| 60 |
-
NC
|
| 61 |
-
NCC
|
| 62 |
-
NCc
|
| 63 |
-
NN
|
| 64 |
-
NO
|
| 65 |
-
Nc
|
| 66 |
-
Nn
|
| 67 |
-
O
|
| 68 |
-
OC
|
| 69 |
-
OCC
|
| 70 |
-
OCO
|
| 71 |
-
OCc
|
| 72 |
-
ON
|
| 73 |
-
OO
|
| 74 |
-
Oc
|
| 75 |
-
P
|
| 76 |
-
R
|
| 77 |
-
S
|
| 78 |
-
SC
|
| 79 |
-
SCC
|
| 80 |
-
SCc
|
| 81 |
-
SS
|
| 82 |
-
Sc
|
| 83 |
-
T
|
| 84 |
-
X
|
| 85 |
-
Z
|
| 86 |
-
[
|
| 87 |
-
\\
|
| 88 |
-
(/
|
| 89 |
-
]
|
| 90 |
-
a
|
| 91 |
-
b
|
| 92 |
-
c
|
| 93 |
-
cc
|
| 94 |
-
ccc
|
| 95 |
-
ccn
|
| 96 |
-
cco
|
| 97 |
-
ccs
|
| 98 |
-
cn
|
| 99 |
-
cnc
|
| 100 |
-
cnn
|
| 101 |
-
co
|
| 102 |
-
coc
|
| 103 |
-
cs
|
| 104 |
-
csc
|
| 105 |
-
csn
|
| 106 |
-
e
|
| 107 |
-
g
|
| 108 |
-
i
|
| 109 |
-
l
|
| 110 |
-
n
|
| 111 |
-
nc
|
| 112 |
-
ncc
|
| 113 |
-
ncn
|
| 114 |
-
nco
|
| 115 |
-
ncs
|
| 116 |
-
nn
|
| 117 |
-
nnc
|
| 118 |
-
nnn
|
| 119 |
-
no
|
| 120 |
-
noc
|
| 121 |
-
non
|
| 122 |
-
ns
|
| 123 |
-
nsc
|
| 124 |
-
nsn
|
| 125 |
-
o
|
| 126 |
-
oc
|
| 127 |
-
occ
|
| 128 |
-
on
|
| 129 |
-
p
|
| 130 |
-
r
|
| 131 |
-
s
|
| 132 |
-
sc
|
| 133 |
-
scc
|
| 134 |
-
scn
|
| 135 |
-
sn
|
| 136 |
-
t
|
| 137 |
-
c1
|
| 138 |
-
c2
|
| 139 |
-
c3
|
| 140 |
-
c4
|
| 141 |
-
c5
|
| 142 |
-
c6
|
| 143 |
-
c7
|
| 144 |
-
c8
|
| 145 |
-
c9
|
| 146 |
-
n1
|
| 147 |
-
n2
|
| 148 |
-
n3
|
| 149 |
-
n4
|
| 150 |
-
n5
|
| 151 |
-
n6
|
| 152 |
-
n7
|
| 153 |
-
n8
|
| 154 |
-
n9
|
| 155 |
-
O1
|
| 156 |
-
O2
|
| 157 |
-
O3
|
| 158 |
-
O4
|
| 159 |
-
O5
|
| 160 |
-
O6
|
| 161 |
-
O7
|
| 162 |
-
O8
|
| 163 |
-
O9
|
| 164 |
-
(c1
|
| 165 |
-
(c2
|
| 166 |
-
c1)
|
| 167 |
-
c2)
|
| 168 |
-
(n1
|
| 169 |
-
(n2
|
| 170 |
-
n1)
|
| 171 |
-
n2)
|
| 172 |
-
(O1
|
| 173 |
-
(O2
|
| 174 |
-
O2)
|
| 175 |
-
=O
|
| 176 |
-
=C
|
| 177 |
-
=c
|
| 178 |
-
=N
|
| 179 |
-
=n
|
| 180 |
-
=CC
|
| 181 |
-
=CN
|
| 182 |
-
=Cc
|
| 183 |
-
=cc
|
| 184 |
-
=NC
|
| 185 |
-
=Nc
|
| 186 |
-
=nC
|
| 187 |
-
=nc
|
| 188 |
-
#C
|
| 189 |
-
#CC
|
| 190 |
-
#CN
|
| 191 |
-
#N
|
| 192 |
-
#NC
|
| 193 |
-
#NN
|
| 194 |
-
(C
|
| 195 |
-
C)
|
| 196 |
-
(O
|
| 197 |
-
O)
|
| 198 |
-
(N
|
| 199 |
-
N)
|
| 200 |
-
NP
|
| 201 |
-
PN
|
| 202 |
-
CP
|
| 203 |
-
PC
|
| 204 |
-
NS
|
| 205 |
-
SN
|
| 206 |
-
SP
|
| 207 |
-
PS
|
| 208 |
-
C(=O)
|
| 209 |
-
(/Br)
|
| 210 |
-
(/C#N)
|
| 211 |
-
(/C)
|
| 212 |
-
(/C=N)
|
| 213 |
-
(/C=O)
|
| 214 |
-
(/CBr)
|
| 215 |
-
(/CC)
|
| 216 |
-
(/CCC)
|
| 217 |
-
(/CCF)
|
| 218 |
-
(/CCN)
|
| 219 |
-
(/CCO)
|
| 220 |
-
(/CCl)
|
| 221 |
-
(/CI)
|
| 222 |
-
(/CN)
|
| 223 |
-
(/CO)
|
| 224 |
-
(/CS)
|
| 225 |
-
(/Cl)
|
| 226 |
-
(/F)
|
| 227 |
-
(/I)
|
| 228 |
-
(/N)
|
| 229 |
-
(/NC)
|
| 230 |
-
(/NCC)
|
| 231 |
-
(/NO)
|
| 232 |
-
(/O)
|
| 233 |
-
(/OC)
|
| 234 |
-
(/OCC)
|
| 235 |
-
(/S)
|
| 236 |
-
(/SC)
|
| 237 |
-
(=C)
|
| 238 |
-
(=C/C)
|
| 239 |
-
(=C/F)
|
| 240 |
-
(=C/I)
|
| 241 |
-
(=C/N)
|
| 242 |
-
(=C/O)
|
| 243 |
-
(=CBr)
|
| 244 |
-
(=CC)
|
| 245 |
-
(=CCF)
|
| 246 |
-
(=CCN)
|
| 247 |
-
(=CCO)
|
| 248 |
-
(=CCl)
|
| 249 |
-
(=CF)
|
| 250 |
-
(=CI)
|
| 251 |
-
(=CN)
|
| 252 |
-
(=CO)
|
| 253 |
-
(=C\\C)
|
| 254 |
-
(=C\\F)
|
| 255 |
-
(=C\\I)
|
| 256 |
-
(=C\\N)
|
| 257 |
-
(=C\\O)
|
| 258 |
-
(=N)
|
| 259 |
-
(=N/C)
|
| 260 |
-
(=N/N)
|
| 261 |
-
(=N/O)
|
| 262 |
-
(=NBr)
|
| 263 |
-
(=NC)
|
| 264 |
-
(=NCC)
|
| 265 |
-
(=NCl)
|
| 266 |
-
(=NN)
|
| 267 |
-
(=NO)
|
| 268 |
-
(=NOC)
|
| 269 |
-
(=N\\C)
|
| 270 |
-
(=N\\N)
|
| 271 |
-
(=N\\O)
|
| 272 |
-
(=O)
|
| 273 |
-
(=S)
|
| 274 |
-
(B)
|
| 275 |
-
(Br)
|
| 276 |
-
(C#C)
|
| 277 |
-
(C#CC)
|
| 278 |
-
(C#CI)
|
| 279 |
-
(C#CO)
|
| 280 |
-
(C#N)
|
| 281 |
-
(C#SN)
|
| 282 |
-
(C)
|
| 283 |
-
(C=C)
|
| 284 |
-
(C=CF)
|
| 285 |
-
(C=CI)
|
| 286 |
-
(C=N)
|
| 287 |
-
(C=NN)
|
| 288 |
-
(C=NO)
|
| 289 |
-
(C=O)
|
| 290 |
-
(C=S)
|
| 291 |
-
(CBr)
|
| 292 |
-
(CC#C)
|
| 293 |
-
(CC#N)
|
| 294 |
-
(CC)
|
| 295 |
-
(CC=C)
|
| 296 |
-
(CC=O)
|
| 297 |
-
(CCBr)
|
| 298 |
-
(CCC)
|
| 299 |
-
(CCCC)
|
| 300 |
-
(CCCF)
|
| 301 |
-
(CCCI)
|
| 302 |
-
(CCCN)
|
| 303 |
-
(CCCO)
|
| 304 |
-
(CCCS)
|
| 305 |
-
(CCCl)
|
| 306 |
-
(CCF)
|
| 307 |
-
(CCI)
|
| 308 |
-
(CCN)
|
| 309 |
-
(CCNC)
|
| 310 |
-
(CCNN)
|
| 311 |
-
(CCNO)
|
| 312 |
-
(CCO)
|
| 313 |
-
(CCOC)
|
| 314 |
-
(CCON)
|
| 315 |
-
(CCS)
|
| 316 |
-
(CCSC)
|
| 317 |
-
(CCl)
|
| 318 |
-
(CF)
|
| 319 |
-
(CI)
|
| 320 |
-
(CN)
|
| 321 |
-
(CN=O)
|
| 322 |
-
(CNC)
|
| 323 |
-
(CNCC)
|
| 324 |
-
(CNCO)
|
| 325 |
-
(CNN)
|
| 326 |
-
(CNNC)
|
| 327 |
-
(CNO)
|
| 328 |
-
(CNOC)
|
| 329 |
-
(CO)
|
| 330 |
-
(COC)
|
| 331 |
-
(COCC)
|
| 332 |
-
(COCI)
|
| 333 |
-
(COCN)
|
| 334 |
-
(COCO)
|
| 335 |
-
(COF)
|
| 336 |
-
(CON)
|
| 337 |
-
(COO)
|
| 338 |
-
(CS)
|
| 339 |
-
(CSC)
|
| 340 |
-
(CSCC)
|
| 341 |
-
(CSCF)
|
| 342 |
-
(CSO)
|
| 343 |
-
(Cl)
|
| 344 |
-
(F)
|
| 345 |
-
(I)
|
| 346 |
-
(N)
|
| 347 |
-
(N=N)
|
| 348 |
-
(N=NO)
|
| 349 |
-
(N=O)
|
| 350 |
-
(N=S)
|
| 351 |
-
(NBr)
|
| 352 |
-
(NC#N)
|
| 353 |
-
(NC)
|
| 354 |
-
(NC=N)
|
| 355 |
-
(NC=O)
|
| 356 |
-
(NC=S)
|
| 357 |
-
(NCBr)
|
| 358 |
-
(NCC)
|
| 359 |
-
(NCCC)
|
| 360 |
-
(NCCF)
|
| 361 |
-
(NCCN)
|
| 362 |
-
(NCCO)
|
| 363 |
-
(NCCS)
|
| 364 |
-
(NCCl)
|
| 365 |
-
(NCNC)
|
| 366 |
-
(NCO)
|
| 367 |
-
(NCS)
|
| 368 |
-
(NCl)
|
| 369 |
-
(NN)
|
| 370 |
-
(NN=O)
|
| 371 |
-
(NNC)
|
| 372 |
-
(NO)
|
| 373 |
-
(NOC)
|
| 374 |
-
(O)
|
| 375 |
-
(OC#N)
|
| 376 |
-
(OC)
|
| 377 |
-
(OC=C)
|
| 378 |
-
(OC=O)
|
| 379 |
-
(OC=S)
|
| 380 |
-
(OCBr)
|
| 381 |
-
(OCC)
|
| 382 |
-
(OCCC)
|
| 383 |
-
(OCCF)
|
| 384 |
-
(OCCI)
|
| 385 |
-
(OCCN)
|
| 386 |
-
(OCCO)
|
| 387 |
-
(OCCS)
|
| 388 |
-
(OCCl)
|
| 389 |
-
(OCF)
|
| 390 |
-
(OCI)
|
| 391 |
-
(OCO)
|
| 392 |
-
(OCOC)
|
| 393 |
-
(OCON)
|
| 394 |
-
(OCSC)
|
| 395 |
-
(OCl)
|
| 396 |
-
(OI)
|
| 397 |
-
(ON)
|
| 398 |
-
(OO)
|
| 399 |
-
(OOC)
|
| 400 |
-
(OOCC)
|
| 401 |
-
(OOSN)
|
| 402 |
-
(OSC)
|
| 403 |
-
(P)
|
| 404 |
-
(S)
|
| 405 |
-
(SC#N)
|
| 406 |
-
(SC)
|
| 407 |
-
(SCC)
|
| 408 |
-
(SCCC)
|
| 409 |
-
(SCCF)
|
| 410 |
-
(SCCN)
|
| 411 |
-
(SCCO)
|
| 412 |
-
(SCCS)
|
| 413 |
-
(SCCl)
|
| 414 |
-
(SCF)
|
| 415 |
-
(SCN)
|
| 416 |
-
(SCOC)
|
| 417 |
-
(SCSC)
|
| 418 |
-
(SCl)
|
| 419 |
-
(SI)
|
| 420 |
-
(SN)
|
| 421 |
-
(SN=O)
|
| 422 |
-
(SO)
|
| 423 |
-
(SOC)
|
| 424 |
-
(SOOO)
|
| 425 |
-
(SS)
|
| 426 |
-
(SSC)
|
| 427 |
-
(SSCC)
|
| 428 |
-
([At])
|
| 429 |
-
([O-])
|
| 430 |
-
([O])
|
| 431 |
-
([S-])
|
| 432 |
-
(\\Br)
|
| 433 |
-
(\\C#N)
|
| 434 |
-
(\\C)
|
| 435 |
-
(\\C=N)
|
| 436 |
-
(\\C=O)
|
| 437 |
-
(\\CBr)
|
| 438 |
-
(\\CC)
|
| 439 |
-
(\\CCC)
|
| 440 |
-
(\\CCO)
|
| 441 |
-
(\\CCl)
|
| 442 |
-
(\\CF)
|
| 443 |
-
(\\CN)
|
| 444 |
-
(\\CNC)
|
| 445 |
-
(\\CO)
|
| 446 |
-
(\\COC)
|
| 447 |
-
(\\Cl)
|
| 448 |
-
(\\F)
|
| 449 |
-
(\\I)
|
| 450 |
-
(\\N)
|
| 451 |
-
(\\NC)
|
| 452 |
-
(\\NCC)
|
| 453 |
-
(\\NN)
|
| 454 |
-
(\\NO)
|
| 455 |
-
(\\NOC)
|
| 456 |
-
(\\O)
|
| 457 |
-
(\\OC)
|
| 458 |
-
(\\OCC)
|
| 459 |
-
(\\ON)
|
| 460 |
-
(\\S)
|
| 461 |
-
(\\SC)
|
| 462 |
-
(\\SCC)
|
| 463 |
-
[Ag+]
|
| 464 |
-
[Ag-4]
|
| 465 |
-
[Ag]
|
| 466 |
-
[Al-3]
|
| 467 |
-
[Al]
|
| 468 |
-
[As+]
|
| 469 |
-
[AsH3]
|
| 470 |
-
[AsH]
|
| 471 |
-
[As]
|
| 472 |
-
[At]
|
| 473 |
-
[B-]
|
| 474 |
-
[B@-]
|
| 475 |
-
[B@@-]
|
| 476 |
-
[BH-]
|
| 477 |
-
[BH2-]
|
| 478 |
-
[BH3-]
|
| 479 |
-
[B]
|
| 480 |
-
[Ba]
|
| 481 |
-
[Br+2]
|
| 482 |
-
[BrH]
|
| 483 |
-
[Br]
|
| 484 |
-
[C+]
|
| 485 |
-
[C-]
|
| 486 |
-
[C@@H]
|
| 487 |
-
[C@@]
|
| 488 |
-
[C@H]
|
| 489 |
-
[C@]
|
| 490 |
-
[CH-]
|
| 491 |
-
[CH2]
|
| 492 |
-
[CH3]
|
| 493 |
-
[CH]
|
| 494 |
-
[C]
|
| 495 |
-
[CaH2]
|
| 496 |
-
[Ca]
|
| 497 |
-
[Cl+2]
|
| 498 |
-
[Cl+3]
|
| 499 |
-
[Cl+]
|
| 500 |
-
[Cs]
|
| 501 |
-
[FH]
|
| 502 |
-
[F]
|
| 503 |
-
[H]
|
| 504 |
-
[He]
|
| 505 |
-
[I+2]
|
| 506 |
-
[I+3]
|
| 507 |
-
[I+]
|
| 508 |
-
[IH]
|
| 509 |
-
[I]
|
| 510 |
-
[K]
|
| 511 |
-
[Kr]
|
| 512 |
-
[Li+]
|
| 513 |
-
[LiH]
|
| 514 |
-
[MgH2]
|
| 515 |
-
[Mg]
|
| 516 |
-
[N+]
|
| 517 |
-
[N-]
|
| 518 |
-
[N@+]
|
| 519 |
-
[N@@+]
|
| 520 |
-
[N@@]
|
| 521 |
-
[N@]
|
| 522 |
-
[NH+]
|
| 523 |
-
[NH-]
|
| 524 |
-
[NH2+]
|
| 525 |
-
[NH3]
|
| 526 |
-
[NH]
|
| 527 |
-
[N]
|
| 528 |
-
[Na]
|
| 529 |
-
[O+]
|
| 530 |
-
[O-]
|
| 531 |
-
[OH+]
|
| 532 |
-
[OH2]
|
| 533 |
-
[OH]
|
| 534 |
-
[O]
|
| 535 |
-
[P+]
|
| 536 |
-
[P@+]
|
| 537 |
-
[P@@+]
|
| 538 |
-
[P@@]
|
| 539 |
-
[P@]
|
| 540 |
-
[PH2]
|
| 541 |
-
[PH]
|
| 542 |
-
[P]
|
| 543 |
-
[Ra]
|
| 544 |
-
[Rb]
|
| 545 |
-
[S+]
|
| 546 |
-
[S-]
|
| 547 |
-
[S@+]
|
| 548 |
-
[S@@+]
|
| 549 |
-
[S@@]
|
| 550 |
-
[S@]
|
| 551 |
-
[SH+]
|
| 552 |
-
[SH2]
|
| 553 |
-
[SH]
|
| 554 |
-
[S]
|
| 555 |
-
[Se+]
|
| 556 |
-
[Se-2]
|
| 557 |
-
[SeH2]
|
| 558 |
-
[SeH]
|
| 559 |
-
[Se]
|
| 560 |
-
[Si@]
|
| 561 |
-
[SiH2]
|
| 562 |
-
[SiH]
|
| 563 |
-
[Si]
|
| 564 |
-
[SrH2]
|
| 565 |
-
[TeH]
|
| 566 |
-
[Te]
|
| 567 |
-
[Xe]
|
| 568 |
-
[Zn+2]
|
| 569 |
-
[Zn-2]
|
| 570 |
-
[Zn]
|
| 571 |
-
[b-]
|
| 572 |
-
[c+]
|
| 573 |
-
[c-]
|
| 574 |
-
[cH-]
|
| 575 |
-
[cH]
|
| 576 |
-
[c]
|
| 577 |
-
[n+]
|
| 578 |
-
[n-]
|
| 579 |
-
[nH]
|
| 580 |
-
[n]
|
| 581 |
-
[o+]
|
| 582 |
-
[s+]
|
| 583 |
-
[se+]
|
| 584 |
-
[se]
|
| 585 |
-
[te+]
|
| 586 |
-
[te]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
load.py → inference.py
RENAMED
|
@@ -48,16 +48,18 @@ def normalize_property_key(name: str) -> str:
|
|
| 48 |
n = name.strip().lower()
|
| 49 |
n = re.sub(r"\s*\(.*?\)\s*", "", n)
|
| 50 |
n = n.replace("-", "_").replace(" ", "_")
|
|
|
|
| 51 |
if "permeability" in n and "pampa" not in n and "caco" not in n:
|
| 52 |
return "permeability_penetrance"
|
| 53 |
if n == "binding_affinity":
|
| 54 |
return "binding_affinity"
|
| 55 |
-
if n
|
| 56 |
-
return "
|
| 57 |
if n == "non_fouling":
|
| 58 |
return "nf"
|
| 59 |
return n
|
| 60 |
|
|
|
|
| 61 |
def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
| 62 |
"""
|
| 63 |
Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
|
|
@@ -111,7 +113,8 @@ MODEL_ALIAS = {
|
|
| 111 |
"XGB": "xgb",
|
| 112 |
"XGB_REG": "xgb_reg",
|
| 113 |
"POOLED": "pooled",
|
| 114 |
-
"UNPOOLED": "unpooled"
|
|
|
|
| 115 |
}
|
| 116 |
def canon_model(label: Optional[str]) -> Optional[str]:
|
| 117 |
if label is None:
|
|
@@ -235,8 +238,25 @@ def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.devic
|
|
| 235 |
model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
|
| 236 |
layers=int(params["layers"]), dropout=dropout)
|
| 237 |
elif model_name == "transformer":
|
| 238 |
-
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 240 |
else:
|
| 241 |
raise ValueError(f"Unknown NN model_name={model_name}")
|
| 242 |
|
|
@@ -648,13 +668,21 @@ class PeptiVersePredictor:
|
|
| 648 |
self._load_all_best_models()
|
| 649 |
|
| 650 |
def _resolve_dir(self, prop_key: str, model_name: str, mode: str) -> Path:
|
| 651 |
-
|
| 652 |
-
|
| 653 |
-
|
| 654 |
-
|
| 655 |
-
|
| 656 |
-
"""
|
| 657 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 658 |
candidates = [
|
| 659 |
base / f"{model_name}_{mode}",
|
| 660 |
base / model_name,
|
|
@@ -667,7 +695,11 @@ class PeptiVersePredictor:
|
|
| 667 |
for d in candidates:
|
| 668 |
if d.exists():
|
| 669 |
return d
|
| 670 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
|
| 672 |
def _load_all_best_models(self):
|
| 673 |
for prop_key, row in self.manifest.items():
|
|
@@ -705,15 +737,24 @@ class PeptiVersePredictor:
|
|
| 705 |
self.models[(prop_key, mode)] = obj
|
| 706 |
else:
|
| 707 |
# rebuild NN architecture
|
| 708 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 709 |
|
| 710 |
self.meta[(prop_key, mode)] = {
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
| 716 |
-
|
|
|
|
| 717 |
|
| 718 |
def _get_features_for_model(self, prop_key: str, mode: str, input_str: str):
|
| 719 |
"""
|
|
@@ -769,6 +810,14 @@ class PeptiVersePredictor:
|
|
| 769 |
X, M = self._get_features_for_model(prop_key, mode, input_str)
|
| 770 |
with torch.no_grad():
|
| 771 |
y = model(X, M).squeeze().float().cpu().item()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
if task_type == "classifier":
|
| 773 |
prob = float(1.0 / (1.0 + np.exp(-y))) # sigmoid(logit)
|
| 774 |
out = {"property": prop_key, "mode": mode, "score": prob}
|
|
@@ -779,15 +828,22 @@ class PeptiVersePredictor:
|
|
| 779 |
else:
|
| 780 |
return {"property": prop_key, "mode": mode, "score": float(y)}
|
| 781 |
|
| 782 |
-
# xgb path
|
| 783 |
if kind == "xgb":
|
| 784 |
-
feats = self._get_features_for_model(prop_key, mode, input_str)
|
| 785 |
dmat = xgb.DMatrix(feats)
|
| 786 |
pred = float(model.predict(dmat)[0])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 787 |
out = {"property": prop_key, "mode": mode, "score": pred}
|
| 788 |
-
|
| 789 |
-
out["label"] = int(pred >= float(thr))
|
| 790 |
-
out["threshold"] = float(thr)
|
| 791 |
return out
|
| 792 |
|
| 793 |
# joblib path (svm/enet/svr)
|
|
@@ -850,7 +906,7 @@ class PeptiVersePredictor:
|
|
| 850 |
cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
|
| 851 |
cls_thr = affinity_to_class(affinity)
|
| 852 |
|
| 853 |
-
names = {0: "High (≥9)", 1: "Moderate (7
|
| 854 |
return {
|
| 855 |
"property": "binding_affinity",
|
| 856 |
"mode": mode,
|
|
@@ -861,14 +917,10 @@ class PeptiVersePredictor:
|
|
| 861 |
}
|
| 862 |
|
| 863 |
|
| 864 |
-
# -----------------------------
|
| 865 |
-
# Minimal usage
|
| 866 |
-
# -----------------------------
|
| 867 |
if __name__ == "__main__":
|
| 868 |
-
# Example:
|
| 869 |
predictor = PeptiVersePredictor(
|
| 870 |
manifest_path="best_models.txt",
|
| 871 |
-
classifier_weight_root="
|
| 872 |
)
|
| 873 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
| 874 |
print(predictor.predict_binding_affinity("wt", target_seq="...", binder_str="..."))
|
|
@@ -879,8 +931,8 @@ if __name__ == "__main__":
|
|
| 879 |
|
| 880 |
wt = WTEmbedder(device)
|
| 881 |
sm = SMILESEmbedder(device,
|
| 882 |
-
vocab_path="/
|
| 883 |
-
splits_path="
|
| 884 |
)
|
| 885 |
|
| 886 |
p = wt.pooled("GIGAVLKVLTTGLPALISWIKRKRQQ") # (1,1280)
|
|
|
|
| 48 |
n = name.strip().lower()
|
| 49 |
n = re.sub(r"\s*\(.*?\)\s*", "", n)
|
| 50 |
n = n.replace("-", "_").replace(" ", "_")
|
| 51 |
+
|
| 52 |
if "permeability" in n and "pampa" not in n and "caco" not in n:
|
| 53 |
return "permeability_penetrance"
|
| 54 |
if n == "binding_affinity":
|
| 55 |
return "binding_affinity"
|
| 56 |
+
if n in {"halflife", "half_life"}:
|
| 57 |
+
return "halflife"
|
| 58 |
if n == "non_fouling":
|
| 59 |
return "nf"
|
| 60 |
return n
|
| 61 |
|
| 62 |
+
|
| 63 |
def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
|
| 64 |
"""
|
| 65 |
Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
|
|
|
|
| 113 |
"XGB": "xgb",
|
| 114 |
"XGB_REG": "xgb_reg",
|
| 115 |
"POOLED": "pooled",
|
| 116 |
+
"UNPOOLED": "unpooled",
|
| 117 |
+
"TRANSFORMER_WT_LOG": "transformer_wt_log",
|
| 118 |
}
|
| 119 |
def canon_model(label: Optional[str]) -> Optional[str]:
|
| 120 |
if label is None:
|
|
|
|
| 238 |
model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
|
| 239 |
layers=int(params["layers"]), dropout=dropout)
|
| 240 |
elif model_name == "transformer":
|
| 241 |
+
d_model = (
|
| 242 |
+
params.get("d_model")
|
| 243 |
+
or params.get("hidden")
|
| 244 |
+
or params.get("hidden_dim")
|
| 245 |
+
)
|
| 246 |
+
if d_model is None:
|
| 247 |
+
raise KeyError(
|
| 248 |
+
f"Transformer checkpoint missing d_model/hidden. "
|
| 249 |
+
f"Available keys: {list(params.keys())}"
|
| 250 |
+
)
|
| 251 |
+
|
| 252 |
+
model = TransformerHead(
|
| 253 |
+
in_dim=in_dim,
|
| 254 |
+
d_model=int(d_model),
|
| 255 |
+
nhead=int(params["nhead"]),
|
| 256 |
+
layers=int(params["layers"]),
|
| 257 |
+
ff=int(params.get("ff", 4 * int(d_model))),
|
| 258 |
+
dropout=dropout
|
| 259 |
+
)
|
| 260 |
else:
|
| 261 |
raise ValueError(f"Unknown NN model_name={model_name}")
|
| 262 |
|
|
|
|
| 668 |
self._load_all_best_models()
|
| 669 |
|
| 670 |
def _resolve_dir(self, prop_key: str, model_name: str, mode: str) -> Path:
|
| 671 |
+
# map halflife -> half_life folder on disk (common layout)
|
| 672 |
+
disk_prop = "half_life" if prop_key == "halflife" else prop_key
|
| 673 |
+
base = self.training_root / disk_prop
|
| 674 |
+
|
| 675 |
+
# special handling for halflife xgb_wt_log / xgb_smiles
|
| 676 |
+
if prop_key == "halflife" and model_name in {"xgb_wt_log", "xgb_smiles"}:
|
| 677 |
+
d = base / model_name
|
| 678 |
+
if d.exists():
|
| 679 |
+
return d
|
| 680 |
+
|
| 681 |
+
if prop_key == "halflife" and model_name == "xgb":
|
| 682 |
+
d = base / ("xgb_wt_log" if mode == "wt" else "xgb_smiles")
|
| 683 |
+
if d.exists():
|
| 684 |
+
return d
|
| 685 |
+
|
| 686 |
candidates = [
|
| 687 |
base / f"{model_name}_{mode}",
|
| 688 |
base / model_name,
|
|
|
|
| 695 |
for d in candidates:
|
| 696 |
if d.exists():
|
| 697 |
return d
|
| 698 |
+
|
| 699 |
+
raise FileNotFoundError(
|
| 700 |
+
f"Cannot find model directory for {prop_key} {model_name} {mode}. Tried: {candidates}"
|
| 701 |
+
)
|
| 702 |
+
|
| 703 |
|
| 704 |
def _load_all_best_models(self):
|
| 705 |
for prop_key, row in self.manifest.items():
|
|
|
|
| 737 |
self.models[(prop_key, mode)] = obj
|
| 738 |
else:
|
| 739 |
# rebuild NN architecture
|
| 740 |
+
arch = m
|
| 741 |
+
if arch.startswith("transformer"):
|
| 742 |
+
arch = "transformer"
|
| 743 |
+
elif arch.startswith("mlp"):
|
| 744 |
+
arch = "mlp"
|
| 745 |
+
elif arch.startswith("cnn"):
|
| 746 |
+
arch = "cnn"
|
| 747 |
+
|
| 748 |
+
self.models[(prop_key, mode)] = build_torch_model_from_ckpt(arch, obj, self.device)
|
| 749 |
|
| 750 |
self.meta[(prop_key, mode)] = {
|
| 751 |
+
"task_type": row.task_type,
|
| 752 |
+
"threshold": thr,
|
| 753 |
+
"artifact": str(art),
|
| 754 |
+
"model_name": m,
|
| 755 |
+
"kind": kind,
|
| 756 |
+
}
|
| 757 |
+
|
| 758 |
|
| 759 |
def _get_features_for_model(self, prop_key: str, mode: str, input_str: str):
|
| 760 |
"""
|
|
|
|
| 810 |
X, M = self._get_features_for_model(prop_key, mode, input_str)
|
| 811 |
with torch.no_grad():
|
| 812 |
y = model(X, M).squeeze().float().cpu().item()
|
| 813 |
+
# invert log1p(hours) ONLY for WT half-life log models
|
| 814 |
+
model_name = meta.get("model_name", "")
|
| 815 |
+
if (
|
| 816 |
+
prop_key == "halflife"
|
| 817 |
+
and mode == "wt"
|
| 818 |
+
and model_name in {"xgb_wt_log", "transformer_wt_log"}
|
| 819 |
+
):
|
| 820 |
+
y = float(np.expm1(y))
|
| 821 |
if task_type == "classifier":
|
| 822 |
prob = float(1.0 / (1.0 + np.exp(-y))) # sigmoid(logit)
|
| 823 |
out = {"property": prop_key, "mode": mode, "score": prob}
|
|
|
|
| 828 |
else:
|
| 829 |
return {"property": prop_key, "mode": mode, "score": float(y)}
|
| 830 |
|
|
|
|
| 831 |
if kind == "xgb":
|
| 832 |
+
feats = self._get_features_for_model(prop_key, mode, input_str)
|
| 833 |
dmat = xgb.DMatrix(feats)
|
| 834 |
pred = float(model.predict(dmat)[0])
|
| 835 |
+
|
| 836 |
+
# invert log1p(hours) ONLY for WT half-life log models
|
| 837 |
+
model_name = meta.get("model_name", "")
|
| 838 |
+
if (
|
| 839 |
+
prop_key == "halflife"
|
| 840 |
+
and mode == "wt"
|
| 841 |
+
and model_name in {"xgb_wt_log", "transformer_wt_log"}
|
| 842 |
+
):
|
| 843 |
+
pred = float(np.expm1(pred))
|
| 844 |
+
|
| 845 |
out = {"property": prop_key, "mode": mode, "score": pred}
|
| 846 |
+
|
|
|
|
|
|
|
| 847 |
return out
|
| 848 |
|
| 849 |
# joblib path (svm/enet/svr)
|
|
|
|
| 906 |
cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
|
| 907 |
cls_thr = affinity_to_class(affinity)
|
| 908 |
|
| 909 |
+
names = {0: "High (≥9)", 1: "Moderate (7-9)", 2: "Low (<7)"}
|
| 910 |
return {
|
| 911 |
"property": "binding_affinity",
|
| 912 |
"mode": mode,
|
|
|
|
| 917 |
}
|
| 918 |
|
| 919 |
|
|
|
|
|
|
|
|
|
|
| 920 |
if __name__ == "__main__":
|
|
|
|
| 921 |
predictor = PeptiVersePredictor(
|
| 922 |
manifest_path="best_models.txt",
|
| 923 |
+
classifier_weight_root="./Classifier_Weight"
|
| 924 |
)
|
| 925 |
print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
|
| 926 |
print(predictor.predict_binding_affinity("wt", target_seq="...", binder_str="..."))
|
|
|
|
| 931 |
|
| 932 |
wt = WTEmbedder(device)
|
| 933 |
sm = SMILESEmbedder(device,
|
| 934 |
+
vocab_path="./tokeizner/new_vocab.txt",
|
| 935 |
+
splits_path="./tokenizer/new_splits.txt"
|
| 936 |
)
|
| 937 |
|
| 938 |
p = wt.pooled("GIGAVLKVLTTGLPALISWIKRKRQQ") # (1,1280)
|
models/best_model_half_life.pth
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f80f1b20e90ba30503804c738aad4b3bb253424ff2e6e8a86c8e13a2fa1669f9
|
| 3 |
-
size 2623795199
|
|
|
|
|
|
|
|
|
|
|
|
models/best_model_hemolysis.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:09b90730272d48f061bf41c79a5ae44f5c977f331d48600c8615852806308be1
|
| 3 |
-
size 1938117
|
|
|
|
|
|
|
|
|
|
|
|
models/best_model_nonfouling.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bc50e344298c5db6f45ba65b09c61f80ee47c8b4b33f7193b068618520c948d1
|
| 3 |
-
size 2275245
|
|
|
|
|
|
|
|
|
|
|
|
models/best_model_solubility.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:948ac245c158aacd51f36cc71f4ee7bbbf3568c92666c637689790a01677fa59
|
| 3 |
-
size 3698748
|
|
|
|
|
|
|
|
|
|
|
|
models/binding_affinity_smiles.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:04986ccd078abd8f744d299b5e73c1e93bf4899896bb8d8f0e2bacbe0e8c6c97
|
| 3 |
-
size 132487302
|
|
|
|
|
|
|
|
|
|
|
|
models/binding_affinity_unpooled.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:fc28ae9f09b981b07547a773ca2e07f241cb08b3b8aa901e66627ff153f3aa8b
|
| 3 |
-
size 2731670995
|
|
|
|
|
|
|
|
|
|
|
|
models/enhancer_class.ckpt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:0cdbd02bc600847aa238967f00bc66882e515d3385e9eb278c1fa85818625492
|
| 3 |
-
size 37598951
|
|
|
|
|
|
|
|
|
|
|
|
models/enhancer_class_hparams.yaml
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:579a2065f9976e9de5f4d90976973c049a33f135268961b76ebe6e38fd986450
|
| 3 |
-
size 1814
|
|
|
|
|
|
|
|
|
|
|
|
models/hemolysis-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d5a3c91f1bd746d6e7eda091147b555e760a5b2585423ce8d75990837e781b51
|
| 3 |
-
size 288201
|
|
|
|
|
|
|
|
|
|
|
|
models/nonfouling-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:269fe5d2a90e52a075949c9de281db5b65f97ea386369c687edb1656b8686381
|
| 3 |
-
size 165817
|
|
|
|
|
|
|
|
|
|
|
|
models/permeability-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:4020f779e0d273fecc7a0dbe3dd43c40bbf028f76559c0f8d687a6da5a715267
|
| 3 |
-
size 6343274
|
|
|
|
|
|
|
|
|
|
|
|
models/solubility-xgboost_smiles.json
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:49395cb4c55a08c9b9683c11133fbde3ef05cf81f4a28d3303c8cf8c06c55597
|
| 3 |
-
size 279803
|
|
|
|
|
|
|
|
|
|
|
|
scoring_functions.py
DELETED
|
@@ -1,103 +0,0 @@
|
|
| 1 |
-
import sys
|
| 2 |
-
import io
|
| 3 |
-
import subprocess
|
| 4 |
-
import warnings
|
| 5 |
-
import numpy as np
|
| 6 |
-
import pandas as pd
|
| 7 |
-
from typing import List
|
| 8 |
-
from loguru import logger
|
| 9 |
-
from tqdm import tqdm
|
| 10 |
-
from rdkit import Chem, rdBase, DataStructs
|
| 11 |
-
from rdkit.Chem import AllChem
|
| 12 |
-
import torch
|
| 13 |
-
from functions.binding.binding import BindingAffinity
|
| 14 |
-
from functions.permeability.permeability import Permeability
|
| 15 |
-
from functions.solubility.solubility import Solubility
|
| 16 |
-
from functions.hemolysis.hemolysis import Hemolysis
|
| 17 |
-
from functions.nonfouling.nonfouling import Nonfouling
|
| 18 |
-
|
| 19 |
-
class ScoringFunctions:
|
| 20 |
-
def __init__(self, score_func_names=None, prot_seqs=[]):
|
| 21 |
-
"""
|
| 22 |
-
Class for generating score vectors given generated sequence
|
| 23 |
-
|
| 24 |
-
Args:
|
| 25 |
-
score_func_names: list of scoring function names to be evaluated
|
| 26 |
-
score_weights: weights to scale scores (default: 1)
|
| 27 |
-
target_protein: sequence of target protein binder
|
| 28 |
-
"""
|
| 29 |
-
if score_func_names is None:
|
| 30 |
-
# just do unmasking based on validity of peptide bonds
|
| 31 |
-
self.score_func_names = []
|
| 32 |
-
else:
|
| 33 |
-
self.score_func_names = score_func_names
|
| 34 |
-
|
| 35 |
-
# self.weights = np.array([1] * len(self.score_func_names) if score_weights is None else score_weights)
|
| 36 |
-
|
| 37 |
-
# binding affinities
|
| 38 |
-
self.target_protein = prot_seqs
|
| 39 |
-
print(len(prot_seqs))
|
| 40 |
-
|
| 41 |
-
if ('binding_affinity1' in score_func_names) and (len(prot_seqs) == 1):
|
| 42 |
-
binding_affinity1 = BindingAffinity(prot_seqs[0])
|
| 43 |
-
binding_affinity2 = None
|
| 44 |
-
elif ('binding_affinity1' in score_func_names) and ('binding_affinity2' in score_func_names) and (len(prot_seqs) == 2):
|
| 45 |
-
binding_affinity1 = BindingAffinity(prot_seqs[0])
|
| 46 |
-
binding_affinity2 = BindingAffinity(prot_seqs[1])
|
| 47 |
-
else:
|
| 48 |
-
print("here")
|
| 49 |
-
binding_affinity1 = None
|
| 50 |
-
binding_affinity2 = None
|
| 51 |
-
|
| 52 |
-
permeability = Permeability()
|
| 53 |
-
sol = Solubility()
|
| 54 |
-
nonfouling = Nonfouling()
|
| 55 |
-
hemo = Hemolysis()
|
| 56 |
-
|
| 57 |
-
self.all_funcs = {'binding_affinity1': binding_affinity1,
|
| 58 |
-
'binding_affinity2': binding_affinity2,
|
| 59 |
-
'permeability': permeability,
|
| 60 |
-
'nonfouling': nonfouling,
|
| 61 |
-
'solubility': sol,
|
| 62 |
-
'hemolysis': hemo
|
| 63 |
-
}
|
| 64 |
-
|
| 65 |
-
def forward(self, input_seqs):
|
| 66 |
-
scores = []
|
| 67 |
-
|
| 68 |
-
for i, score_func in enumerate(self.score_func_names):
|
| 69 |
-
score = self.all_funcs[score_func](input_seqs = input_seqs)
|
| 70 |
-
|
| 71 |
-
scores.append(score)
|
| 72 |
-
|
| 73 |
-
# convert to numpy arrays with shape (num_sequences, num_functions)
|
| 74 |
-
scores = np.float32(scores).T
|
| 75 |
-
|
| 76 |
-
return scores
|
| 77 |
-
|
| 78 |
-
def __call__(self, input_seqs: list):
|
| 79 |
-
return self.forward(input_seqs)
|
| 80 |
-
|
| 81 |
-
|
| 82 |
-
def unittest():
|
| 83 |
-
amhr = 'MLGSLGLWALLPTAVEAPPNRRTCVFFEAPGVRGSTKTLGELLDTGTELPRAIRCLYSRCCFGIWNLTQDRAQVEMQGCRDSDEPGCESLHCDPSPRAHPSPGSTLFTCSCGTDFCNANYSHLPPPGSPGTPGSQGPQAAPGESIWMALVLLGLFLLLLLLLGSIILALLQRKNYRVRGEPVPEPRPDSGRDWSVELQELPELCFSQVIREGGHAVVWAGQLQGKLVAIKAFPPRSVAQFQAERALYELPGLQHDHIVRFITASRGGPGRLLSGPLLVLELHPKGSLCHYLTQYTSDWGSSLRMALSLAQGLAFLHEERWQNGQYKPGIAHRDLSSQNVLIREDGSCAIGDLGLALVLPGLTQPPAWTPTQPQGPAAIMEAGTQRYMAPELLDKTLDLQDWGMALRRADIYSLALLLWEILSRCPDLRPDSSPPPFQLAYEAELGNTPTSDELWALAVQERRRPYIPSTWRCFATDPDGLRELLEDCWDADPEARLTAECVQQRLAALAHPQESHPFPESCPRGCPPLCPEDCTSIPAPTILPCRPQRSACHFSVQQGPCSRNPQPACTLSPV'
|
| 84 |
-
tfr = 'MMDQARSAFSNLFGGEPLSYTRFSLARQVDGDNSHVEMKLAVDEEENADNNTKANVTKPKRCSGSICYGTIAVIVFFLIGFMIGYLGYCKGVEPKTECERLAGTESPVREEPGEDFPAARRLYWDDLKRKLSEKLDSTDFTGTIKLLNENSYVPREAGSQKDENLALYVENQFREFKLSKVWRDQHFVKIQVKDSAQNSVIIVDKNGRLVYLVENPGGYVAYSKAATVTGKLVHANFGTKKDFEDLYTPVNGSIVIVRAGKITFAEKVANAESLNAIGVLIYMDQTKFPIVNAELSFFGHAHLGTGDPYTPGFPSFNHTQFPPSRSSGLPNIPVQTISRAAAEKLFGNMEGDCPSDWKTDSTCRMVTSESKNVKLTVSNVLKEIKILNIFGVIKGFVEPDHYVVVGAQRDAWGPGAAKSGVGTALLLKLAQMFSDMVLKDGFQPSRSIIFASWSAGDFGSVGATEWLEGYLSSLHLKAFTYINLDKAVLGTSNFKVSASPLLYTLIEKTMQNVKHPVTGQFLYQDSNWASKVEKLTLDNAAFPFLAYSGIPAVSFCFCEDTDYPYLGTTMDTYKELIERIPELNKVARAAAEVAGQFVIKLTHDVELNLDYERYNSQLLSFVRDLNQYRADIKEMGLSLQWLYSARGDFFRATSRLTTDFGNAEKTDRFVMKKLNDRVMRVEYHFLSPYVSPKESPFRHVFWGSGSHTLPALLENLKLRKQNNGAFNETLFRNQLALATWTIQGAANALSGDVWDIDNEF'
|
| 85 |
-
gfap = 'MERRRITSAARRSYVSSGEMMVGGLAPGRRLGPGTRLSLARMPPPLPTRVDFSLAGALNAGFKETRASERAEMMELNDRFASYIEKVRFLEQQNKALAAELNQLRAKEPTKLADVYQAELRELRLRLDQLTANSARLEVERDNLAQDLATVRQKLQDETNLRLEAENNLAAYRQEADEATLARLDLERKIESLEEEIRFLRKIHEEEVRELQEQLARQQVHVELDVAKPDLTAALKEIRTQYEAMASSNMHEAEEWYRSKFADLTDAAARNAELLRQAKHEANDYRRQLQSLTCDLESLRGTNESLERQMREQEERHVREAASYQEALARLEEEGQSLKDEMARHLQEYQDLLNVKLALDIEIATYRKLLEGEENRITIPVQTFSNLQIRETSLDTKSVSEGHLKRNIVVKTVEMRDGEVIKESKQEHKDVM'
|
| 86 |
-
glp1 = 'MAGAPGPLRLALLLLGMVGRAGPRPQGATVSLWETVQKWREYRRQCQRSLTEDPPPATDLFCNRTFDEYACWPDGEPGSFVNVSCPWYLPWASSVPQGHVYRFCTAEGLWLQKDNSSLPWRDLSECEESKRGERSSPEEQLLFLYIIYTVGYALSFSALVIASAILLGFRHLHCTRNYIHLNLFASFILRALSVFIKDAALKWMYSTAAQQHQWDGLLSYQDSLSCRLVFLLMQYCVAANYYWLLVEGVYLYTLLAFSVLSEQWIFRLYVSIGWGVPLLFVVPWGIVKYLYEDEGCWTRNSNMNYWLIIRLPILFAIGVNFLIFVRVICIVVSKLKANLMCKTDIKCRLAKSTLTLIPLLGTHEVIFAFVMDEHARGTLRFIKLFTELSFTSFQGLMVAILYCFVNNEVQLEFRKSWERWRLEHLHIQRDSSMKPLKCPTSSLSSGATAGSSMYTATCQASCS'
|
| 87 |
-
glast = 'MTKSNGEEPKMGGRMERFQQGVRKRTLLAKKKVQNITKEDVKSYLFRNAFVLLTVTAVIVGTILGFTLRPYRMSYREVKYFSFPGELLMRMLQMLVLPLIISSLVTGMAALDSKASGKMGMRAVVYYMTTTIIAVVIGIIIVIIIHPGKGTKENMHREGKIVRVTAADAFLDLIRNMFPPNLVEACFKQFKTNYEKRSFKVPIQANETLVGAVINNVSEAMETLTRITEELVPVPGSVNGVNALGLVVFSMCFGFVIGNMKEQGQALREFFDSLNEAIMRLVAVIMWYAPVGILFLIAGKIVEMEDMGVIGGQLAMYTVTVIVGLLIHAVIVLPLLYFLVTRKNPWVFIGGLLQALITALGTSSSSATLPITFKCLEENNGVDKRVTRFVLPVGATINMDGTALYEALAAIFIAQVNNFELNFGQIITISITATAASIGAAGIPQAGLVTMVIVLTSVGLPTDDITLIIAVDWFLDRLRTTTNVLGDSLGAGIVEHLSRHELKNRDVEMGNSVIEENEMKKPYQLIAQDNETEKPIDSETKM'
|
| 88 |
-
ncam = 'LQTKDLIWTLFFLGTAVSLQVDIVPSQGEISVGESKFFLCQVAGDAKDKDISWFSPNGEKLTPNQQRISVVWNDDSSSTLTIYNANIDDAGIYKCVVTGEDGSESEATVNVKIFQKLMFKNAPTPQEFREGEDAVIVCDVVSSLPPTIIWKHKGRDVILKKDVRFIVLSNNYLQIRGIKKTDEGTYRCEGRILARGEINFKDIQVIVNVPPTIQARQNIVNATANLGQSVTLVCDAEGFPEPTMSWTKDGEQIEQEEDDEKYIFSDDSSQLTIKKVDKNDEAEYICIAENKAGEQDATIHLKVFAKPKITYVENQTAMELEEQVTLTCEASGDPIPSITWRTSTRNISSEEKASWTRPEKQETLDGHMVVRSHARVSSLTLKSIQYTDAGEYICTASNTIGQDSQSMYLEVQYAPKLQGPVAVYTWEGNQVNITCEVFAYPSATISWFRDGQLLPSSNYSNIKIYNTPSASYLEVTPDSENDFGNYNCTAVNRIGQESLEFILVQADTPSSPSIDQVEPYSSTAQVQFDEPEATGGVPILKYKAEWRAVGEEVWHSKWYDAKEASMEGIVTIVGLKPETTYAVRLAALNGKGLGEISAASEF'
|
| 89 |
-
cereblon = 'MAGEGDQQDAAHNMGNHLPLLPAESEEEDEMEVEDQDSKEAKKPNIINFDTSLPTSHTYLGADMEEFHGRTLHDDDSCQVIPVLPQVMMILIPGQTLPLQLFHPQEVSMVRNLIQKDRTFAVLAYSNVQEREAQFGTTAEIYAYREEQDFGIEIVKVKAIGRQRFKVLELRTQSDGIQQAKVQILPECVLPSTMSAVQLESLNKCQIFPSKPVSREDQCSYKWWQKYQKRKFHCANLTSWPRWLYSLYDAETLMDRIKKQLREWDENLKDDSLPSNPIDFSYRVAACLPIDDVLRIQLLKIGSAIQRLRCELDIMNKCTSLCCKQCQETEITTKNEIFSLSLCGPMAAYVNPHGYVHETLTVYKACNLNLIGRPSTEHSWFPGYAWTVAQCKICASHIGWKFTATKKDMSPQKFWGLTRSALLPTIPDTEDEISPDKVILCL'
|
| 90 |
-
|
| 91 |
-
num_iter = 0
|
| 92 |
-
score_func_times = [0, 1, 2, 3, 4, 5]
|
| 93 |
-
|
| 94 |
-
scoring = ScoringFunctions(score_func_names=['binding_affinity1', 'solubility', 'hemolysis', 'nonfouling', 'permeability'], prot_seqs=[tfr])
|
| 95 |
-
|
| 96 |
-
smiles = ['N2[C@H](CC(C)C)C(=O)N1[C@@H](CCC1)C(=O)N1[C@@H](CCC1)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](Cc1ccccc1C(F)(F)F)C(=O)N1[C@@H](CCC1)C(=O)N[C@@H](CCSC)C(=O)N[C@@H](CC1=CN=C-N1)C(=O)N[C@@H](CCC(=O)O)C(=O)N[C@@H](CCCNC(=N)N)C(=O)N[C@@H](CC(=O)N)C2(=O)']
|
| 97 |
-
|
| 98 |
-
scores = scoring(input_seqs=smiles)
|
| 99 |
-
print(scores)
|
| 100 |
-
print(len(scores))
|
| 101 |
-
|
| 102 |
-
if __name__ == '__main__':
|
| 103 |
-
unittest()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|