Spaces:
Running
Running
Commit
·
05f80db
1
Parent(s):
b5075d2
update
Browse files- app.py +4 -4
- description.md +9 -6
- tokenizer/new_vocab.txt +0 -1
- training_data/half_life_smiles.csv +0 -3
- training_data/hemo-negative.npz +0 -3
- training_data/hemo-positive.npz +0 -3
- training_data/nf-negative.npz +0 -3
- training_data/nf-positive.npz +0 -3
- training_data/sol-negative.npz +0 -3
- training_data/sol-positive.npz +0 -3
- training_data/toxicity_train.csv +0 -0
- training_data/toxicity_val.csv +0 -0
app.py
CHANGED
|
@@ -837,8 +837,8 @@ class UnifiedPeptidePredictor:
|
|
| 837 |
'path': 'models/best_model_nonfouling.json',
|
| 838 |
'unit': 'Probability',
|
| 839 |
'display_name': '🎯 Non-Fouling',
|
| 840 |
-
'positive_label': 'Non-
|
| 841 |
-
'negative_label': '
|
| 842 |
},
|
| 843 |
'nonfouling_smiles': {
|
| 844 |
'type': 'xgboost',
|
|
@@ -846,8 +846,8 @@ class UnifiedPeptidePredictor:
|
|
| 846 |
'path': 'models/nonfouling-xgboost_smiles.json',
|
| 847 |
'unit': 'Probability',
|
| 848 |
'display_name': '🎯 Non-Fouling',
|
| 849 |
-
'positive_label': '
|
| 850 |
-
'negative_label': '
|
| 851 |
},
|
| 852 |
'binding_affinity': {
|
| 853 |
'type': 'binding',
|
|
|
|
| 837 |
'path': 'models/best_model_nonfouling.json',
|
| 838 |
'unit': 'Probability',
|
| 839 |
'display_name': '🎯 Non-Fouling',
|
| 840 |
+
'positive_label': 'Non-fouling',
|
| 841 |
+
'negative_label': 'Fouling'
|
| 842 |
},
|
| 843 |
'nonfouling_smiles': {
|
| 844 |
'type': 'xgboost',
|
|
|
|
| 846 |
'path': 'models/nonfouling-xgboost_smiles.json',
|
| 847 |
'unit': 'Probability',
|
| 848 |
'display_name': '🎯 Non-Fouling',
|
| 849 |
+
'positive_label': 'Non-fouling',
|
| 850 |
+
'negative_label': 'Fouling'
|
| 851 |
},
|
| 852 |
'binding_affinity': {
|
| 853 |
'type': 'binding',
|
description.md
CHANGED
|
@@ -9,34 +9,34 @@ Our models are trained on curated datasets from multiple sources:
|
|
| 9 |
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 10 |
- **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
|
| 11 |
- **Description:** Probability of peptide disrupting red blood cell membranes.
|
| 12 |
-
- **Download:** [
|
| 13 |
|
| 14 |
#### Solubility Dataset
|
| 15 |
- **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
|
| 16 |
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 17 |
- **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
|
| 18 |
- **Description:** Probability of peptide remaining dissolved in aqueous conditions.
|
| 19 |
-
- **Download:** [
|
| 20 |
|
| 21 |
#### Non-Fouling Dataset
|
| 22 |
- **Primary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
|
| 23 |
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 24 |
- **Size:** 3,600 positive, 13,585 negative
|
| 25 |
- **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
|
| 26 |
-
- **Download:** [
|
| 27 |
|
| 28 |
#### Permeability Dataset
|
| 29 |
- **Primary Source:** [CycPeptMPDB](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.2c01573), [PAMPA](https://doi.org/10.1517/17425255.1.2.325)
|
| 30 |
- **Secondary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
|
| 31 |
- **Size:** 1162 positive and negative for nanonical samples each (22 relevant cell-penetrating peptide databases by compiling literature on existing cell-penetrating peptide prediction models ); CycPeptMPDB provides extra 7334 cyclic peptides
|
| 32 |
- **Description:** Probability of peptide penetrating the cell membrane.
|
| 33 |
-
- **Download:** [
|
| 34 |
|
| 35 |
#### Half-life Dataset
|
| 36 |
- **Primary Source:** [Thpdb2](https://doi.org/10.1016/j.drudis.2024.104047), [PepTherDia](https://doi.org/10.1016/j.drudis.2021.02.019), [peplife](https://www.nature.com/articles/srep36617)
|
| 37 |
- **Size:** 105 wt, 275 wt+noncanonical, human-only
|
| 38 |
- **Clean-ups:** Data are all transformed into log\(hour\)
|
| 39 |
-
- **Download:** [binding_affinity_training_data.csv](
|
| 40 |
|
| 41 |
|
| 42 |
#### Binding Affinity Dataset
|
|
@@ -44,7 +44,7 @@ Our models are trained on curated datasets from multiple sources:
|
|
| 44 |
- **Size:** 1,781 protein-peptide complexes, canonical and non-canonical
|
| 45 |
- **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
|
| 46 |
- **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
|
| 47 |
-
- **Download:** [binding_affinity_training_data.csv](
|
| 48 |
|
| 49 |
### Model Architecture
|
| 50 |
|
|
@@ -53,6 +53,9 @@ Our models are trained on curated datasets from multiple sources:
|
|
| 53 |
- **CNN Models:** 1D convolutional networks with attention mechanisms
|
| 54 |
- **Binding Model:** Cross-attention between protein and peptide representations
|
| 55 |
|
|
|
|
|
|
|
|
|
|
| 56 |
### Citation
|
| 57 |
|
| 58 |
If you use this tool, please cite:
|
|
|
|
| 9 |
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 10 |
- **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
|
| 11 |
- **Description:** Probability of peptide disrupting red blood cell membranes.
|
| 12 |
+
- **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
|
| 13 |
|
| 14 |
#### Solubility Dataset
|
| 15 |
- **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
|
| 16 |
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 17 |
- **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
|
| 18 |
- **Description:** Probability of peptide remaining dissolved in aqueous conditions.
|
| 19 |
+
- **Download:** [sol-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/sol-positive.npz)
|
| 20 |
|
| 21 |
#### Non-Fouling Dataset
|
| 22 |
- **Primary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
|
| 23 |
- **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
|
| 24 |
- **Size:** 3,600 positive, 13,585 negative
|
| 25 |
- **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
|
| 26 |
+
- **Download:** [nf-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/nf-positive.npz)
|
| 27 |
|
| 28 |
#### Permeability Dataset
|
| 29 |
- **Primary Source:** [CycPeptMPDB](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.2c01573), [PAMPA](https://doi.org/10.1517/17425255.1.2.325)
|
| 30 |
- **Secondary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
|
| 31 |
- **Size:** 1162 positive and negative for nanonical samples each (22 relevant cell-penetrating peptide databases by compiling literature on existing cell-penetrating peptide prediction models ); CycPeptMPDB provides extra 7334 cyclic peptides
|
| 32 |
- **Description:** Probability of peptide penetrating the cell membrane.
|
| 33 |
+
- **Download:** [nc-CPP-processed.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/nc-CPP-processed.csv)
|
| 34 |
|
| 35 |
#### Half-life Dataset
|
| 36 |
- **Primary Source:** [Thpdb2](https://doi.org/10.1016/j.drudis.2024.104047), [PepTherDia](https://doi.org/10.1016/j.drudis.2021.02.019), [peplife](https://www.nature.com/articles/srep36617)
|
| 37 |
- **Size:** 105 wt, 275 wt+noncanonical, human-only
|
| 38 |
- **Clean-ups:** Data are all transformed into log\(hour\)
|
| 39 |
+
- **Download:** [half_life_smiles.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/half_life_smiles.csv)
|
| 40 |
|
| 41 |
|
| 42 |
#### Binding Affinity Dataset
|
|
|
|
| 44 |
- **Size:** 1,781 protein-peptide complexes, canonical and non-canonical
|
| 45 |
- **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
|
| 46 |
- **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
|
| 47 |
+
- **Download:** [c-binding.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
|
| 48 |
|
| 49 |
### Model Architecture
|
| 50 |
|
|
|
|
| 53 |
- **CNN Models:** 1D convolutional networks with attention mechanisms
|
| 54 |
- **Binding Model:** Cross-attention between protein and peptide representations
|
| 55 |
|
| 56 |
+
### Model Training and Weight Hosting
|
| 57 |
+
- [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
|
| 58 |
+
|
| 59 |
### Citation
|
| 60 |
|
| 61 |
If you use this tool, please cite:
|
tokenizer/new_vocab.txt
CHANGED
|
@@ -92,7 +92,6 @@ b
|
|
| 92 |
c
|
| 93 |
cc
|
| 94 |
ccc
|
| 95 |
-
cccc
|
| 96 |
ccn
|
| 97 |
cco
|
| 98 |
ccs
|
|
|
|
| 92 |
c
|
| 93 |
cc
|
| 94 |
ccc
|
|
|
|
| 95 |
ccn
|
| 96 |
cco
|
| 97 |
ccs
|
training_data/half_life_smiles.csv
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:d90293170442bc81af2cf9f64656c40bf884733947ca52b2f9308f42220680c3
|
| 3 |
-
size 174323
|
|
|
|
|
|
|
|
|
|
|
|
training_data/hemo-negative.npz
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:f83aad41f160deb6401bc0801bddc931488da6e1785749e6f72de6d0f154a37f
|
| 3 |
-
size 109451
|
|
|
|
|
|
|
|
|
|
|
|
training_data/hemo-positive.npz
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:96cb24d5a7617f7e211cd48d2b0b424a46affa95716b96058058902068068d27
|
| 3 |
-
size 27840
|
|
|
|
|
|
|
|
|
|
|
|
training_data/nf-negative.npz
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:e506e52e101308dd3882ca6bd45833a6e0837f9f240aa85d575c2a41e305b854
|
| 3 |
-
size 21845190
|
|
|
|
|
|
|
|
|
|
|
|
training_data/nf-positive.npz
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:78caae183fe840b145275d9486a3f94a963989deb9d55a57995653bf1d497bf2
|
| 3 |
-
size 41326
|
|
|
|
|
|
|
|
|
|
|
|
training_data/sol-negative.npz
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:c3b6d380024e0483e15e3e219a7cbf23f4d178d823287cef24bc1bd918a817b6
|
| 3 |
-
size 15469064
|
|
|
|
|
|
|
|
|
|
|
|
training_data/sol-positive.npz
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:46169267fd0d37d8a063a4e9fc1cdd9b701a9211b1f16515e3d569fcf2d4d859
|
| 3 |
-
size 14056264
|
|
|
|
|
|
|
|
|
|
|
|
training_data/toxicity_train.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|
training_data/toxicity_val.csv
DELETED
|
The diff for this file is too large to render.
See raw diff
|
|
|