yinuozhang commited on
Commit
05f80db
·
1 Parent(s): b5075d2
app.py CHANGED
@@ -837,8 +837,8 @@ class UnifiedPeptidePredictor:
837
  'path': 'models/best_model_nonfouling.json',
838
  'unit': 'Probability',
839
  'display_name': '👯 Non-Fouling',
840
- 'positive_label': 'Non-toxic',
841
- 'negative_label': 'Toxic'
842
  },
843
  'nonfouling_smiles': {
844
  'type': 'xgboost',
@@ -846,8 +846,8 @@ class UnifiedPeptidePredictor:
846
  'path': 'models/nonfouling-xgboost_smiles.json',
847
  'unit': 'Probability',
848
  'display_name': '👯 Non-Fouling',
849
- 'positive_label': 'Stable',
850
- 'negative_label': 'Unstable'
851
  },
852
  'binding_affinity': {
853
  'type': 'binding',
 
837
  'path': 'models/best_model_nonfouling.json',
838
  'unit': 'Probability',
839
  'display_name': '👯 Non-Fouling',
840
+ 'positive_label': 'Non-fouling',
841
+ 'negative_label': 'Fouling'
842
  },
843
  'nonfouling_smiles': {
844
  'type': 'xgboost',
 
846
  'path': 'models/nonfouling-xgboost_smiles.json',
847
  'unit': 'Probability',
848
  'display_name': '👯 Non-Fouling',
849
+ 'positive_label': 'Non-fouling',
850
+ 'negative_label': 'Fouling'
851
  },
852
  'binding_affinity': {
853
  'type': 'binding',
description.md CHANGED
@@ -9,34 +9,34 @@ Our models are trained on curated datasets from multiple sources:
9
  - **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
10
  - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
11
  - **Description:** Probability of peptide disrupting red blood cell membranes.
12
- - **Download:** [hemolysis_training_data.csv](#)
13
 
14
  #### Solubility Dataset
15
  - **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
16
  - **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
17
  - **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
18
  - **Description:** Probability of peptide remaining dissolved in aqueous conditions.
19
- - **Download:** [solubility_training_data.csv](#)
20
 
21
  #### Non-Fouling Dataset
22
  - **Primary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
23
  - **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
24
  - **Size:** 3,600 positive, 13,585 negative
25
  - **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
26
- - **Download:** [solubility_training_data.csv](#)
27
 
28
  #### Permeability Dataset
29
  - **Primary Source:** [CycPeptMPDB](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.2c01573), [PAMPA](https://doi.org/10.1517/17425255.1.2.325)
30
  - **Secondary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
31
  - **Size:** 1,162 positive and 1,162 negative canonical samples (compiled from 22 relevant cell-penetrating peptide databases gathered from the literature on existing cell-penetrating peptide prediction models); CycPeptMPDB provides an additional 7,334 cyclic peptides
32
  - **Description:** Probability of peptide penetrating the cell membrane.
33
- - **Download:** [binding_affinity_training_data.csv](#)
34
 
35
  #### Half-life Dataset
36
  - **Primary Source:** [Thpdb2](https://doi.org/10.1016/j.drudis.2024.104047), [PepTherDia](https://doi.org/10.1016/j.drudis.2021.02.019), [peplife](https://www.nature.com/articles/srep36617)
37
  - **Size:** 105 wt, 275 wt+noncanonical, human-only
38
  - **Clean-ups:** Data are all transformed into log\(hour\)
39
- - **Download:** [binding_affinity_training_data.csv](#)
40
 
41
 
42
  #### Binding Affinity Dataset
@@ -44,7 +44,7 @@ Our models are trained on curated datasets from multiple sources:
44
  - **Size:** 1,781 protein-peptide complexes, canonical and non-canonical
45
  - **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
46
  - **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
47
- - **Download:** [binding_affinity_training_data.csv](#)
48
 
49
  ### Model Architecture
50
 
@@ -53,6 +53,9 @@ Our models are trained on curated datasets from multiple sources:
53
  - **CNN Models:** 1D convolutional networks with attention mechanisms
54
  - **Binding Model:** Cross-attention between protein and peptide representations
55
 
 
 
 
56
  ### Citation
57
 
58
  If you use this tool, please cite:
 
9
  - **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
10
  - **Size:** 9,316 peptides, with 19.6% being positive (hemolytic) and 80.4% being negative (nonhemolytic)
11
  - **Description:** Probability of peptide disrupting red blood cell membranes.
12
+ - **Download:** [hemo-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/hemo-positive.npz)
13
 
14
  #### Solubility Dataset
15
  - **Primary Source:** [PROSO-II](https://febs.onlinelibrary.wiley.com/doi/abs/10.1111/j.1742-4658.2012.08603.x)
16
  - **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
17
  - **Size:** 18,453 sequences, with 47.6% being labeled as positives and 52.4% being labeled as negatives
18
  - **Description:** Probability of peptide remaining dissolved in aqueous conditions.
19
+ - **Download:** [sol-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/sol-positive.npz)
20
 
21
  #### Non-Fouling Dataset
22
  - **Primary Source:** [Classifying antimicrobial and multifunctional peptides with Bayesian network models](https://doi.org/10.1002/pep2.24079)
23
  - **Secondary Source:** [peptideBERT](https://pubs.acs.org/doi/abs/10.1021/acs.jpclett.3c02398)
24
  - **Size:** 3,600 positive, 13,585 negative
25
  - **Description:** A nonfouling peptide resists nonspecific interactions and protein adsorption.
26
+ - **Download:** [nf-positive.npz](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/nf-positive.npz)
27
 
28
  #### Permeability Dataset
29
  - **Primary Source:** [CycPeptMPDB](https://pubs.acs.org/doi/abs/10.1021/acs.jcim.2c01573), [PAMPA](https://doi.org/10.1517/17425255.1.2.325)
30
  - **Secondary Source:** [PepLand](https://arxiv.org/abs/2311.04419)
31
  - **Size:** 1,162 positive and 1,162 negative canonical samples (compiled from 22 relevant cell-penetrating peptide databases gathered from the literature on existing cell-penetrating peptide prediction models); CycPeptMPDB provides an additional 7,334 cyclic peptides
32
  - **Description:** Probability of peptide penetrating the cell membrane.
33
+ - **Download:** [nc-CPP-processed.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/nc-CPP-processed.csv)
34
 
35
  #### Half-life Dataset
36
  - **Primary Source:** [Thpdb2](https://doi.org/10.1016/j.drudis.2024.104047), [PepTherDia](https://doi.org/10.1016/j.drudis.2021.02.019), [peplife](https://www.nature.com/articles/srep36617)
37
  - **Size:** 105 wt, 275 wt+noncanonical, human-only
38
  - **Clean-ups:** Data are all transformed into log\(hour\)
39
+ - **Download:** [half_life_smiles.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/half_life_smiles.csv)
40
 
41
 
42
  #### Binding Affinity Dataset
 
44
  - **Size:** 1,781 protein-peptide complexes, canonical and non-canonical
45
  - **Description:** Binding probability normalized in PepLand already. It's a combination of IC50/EC50.
46
  - **Quality:** Binding class cutoffs: Tight ≥ 7.5, Medium 6.0–7.5, Weak < 6.0
47
+ - **Download:** [c-binding.csv](https://huggingface.co/ChatterjeeLab/Classifier_Weight/blob/main/training_data/c-binding.csv)
48
 
49
  ### Model Architecture
50
 
 
53
  - **CNN Models:** 1D convolutional networks with attention mechanisms
54
  - **Binding Model:** Cross-attention between protein and peptide representations
55
 
56
+ ### Model Training and Weight Hosting
57
+ - [Classifier_weights](https://huggingface.co/ChatterjeeLab/Classifier_Weight)
58
+
59
  ### Citation
60
 
61
  If you use this tool, please cite:
tokenizer/new_vocab.txt CHANGED
@@ -92,7 +92,6 @@ b
92
  c
93
  cc
94
  ccc
95
- cccc
96
  ccn
97
  cco
98
  ccs
 
92
  c
93
  cc
94
  ccc
 
95
  ccn
96
  cco
97
  ccs
training_data/half_life_smiles.csv DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:d90293170442bc81af2cf9f64656c40bf884733947ca52b2f9308f42220680c3
3
- size 174323
 
 
 
 
training_data/hemo-negative.npz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:f83aad41f160deb6401bc0801bddc931488da6e1785749e6f72de6d0f154a37f
3
- size 109451
 
 
 
 
training_data/hemo-positive.npz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:96cb24d5a7617f7e211cd48d2b0b424a46affa95716b96058058902068068d27
3
- size 27840
 
 
 
 
training_data/nf-negative.npz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:e506e52e101308dd3882ca6bd45833a6e0837f9f240aa85d575c2a41e305b854
3
- size 21845190
 
 
 
 
training_data/nf-positive.npz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:78caae183fe840b145275d9486a3f94a963989deb9d55a57995653bf1d497bf2
3
- size 41326
 
 
 
 
training_data/sol-negative.npz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:c3b6d380024e0483e15e3e219a7cbf23f4d178d823287cef24bc1bd918a817b6
3
- size 15469064
 
 
 
 
training_data/sol-positive.npz DELETED
@@ -1,3 +0,0 @@
1
- version https://git-lfs.github.com/spec/v1
2
- oid sha256:46169267fd0d37d8a063a4e9fc1cdd9b701a9211b1f16515e3d569fcf2d4d859
3
- size 14056264
 
 
 
 
training_data/toxicity_train.csv DELETED
The diff for this file is too large to render. See raw diff
 
training_data/toxicity_val.csv DELETED
The diff for this file is too large to render. See raw diff