eachanjohnson committed on
Commit
20b47ac
·
verified ·
1 Parent(s): bbec84a

Upload folder using huggingface_hub

Browse files
README.md CHANGED
@@ -12,7 +12,7 @@ datasets:
12
 
13
  # Predictor of _Yersinia pestis_ MICs
14
 
15
- _Updated:_ cd ..Tue Apr 1 03:31:50 BST 2025
16
 
17
  Trained on the _Yersinia pestis_, WT accumulator phenotype subset of the [human-curated SPARK dataset](https://doi.org/10.1021/acsinfecdis.8b00193) (10002 rows in total for _Yersinia pestis_).
18
 
@@ -26,7 +26,7 @@ Duvida also saves the training data in this checkpoint to allow the calculation
26
  based on that training data.
27
 
28
  This model is the best regression model from a hyperparameter search, determined
29
- by Spearman's $\rho$ on a held-out test set not used in training or early stopping.
30
 
31
  ### Model architecture
32
 
@@ -35,13 +35,13 @@ by Spearman's $\rho$ on a held-out test set not used in training or early stoppi
35
  ```json
36
 
37
  {
38
- "dropout": 0.2,
39
  "ensemble_size": 3,
40
  "extra_featurizers": null,
41
- "learning_rate": 1e-05,
42
- "model_class": "FPMLPModelBox",
43
- "n_hidden": 5,
44
- "n_units": 64,
45
  "use_2d": true,
46
  "use_fp": true
47
  }
@@ -79,39 +79,39 @@ Train (7002 rows):
79
  ```json
80
 
81
  {
82
- "Pearson r": 0.8172829331255921,
83
- "RMSE": 0.031726885586977005,
84
- "Spearman rho": 0.9709886269987218
85
  }
86
  ```
87
 
88
- <img src="predictions-train.png" width=450>
89
 
90
  Validation (1499 rows):
91
 
92
  ```json
93
 
94
  {
95
- "Pearson r": 0.722565134551274,
96
- "RMSE": 0.038165923207998276,
97
- "Spearman rho": 0.9247149302918276
98
  }
99
  ```
100
 
101
- <img src="predictions-validation.png" width=450>
102
 
103
  Test (1501 rows):
104
 
105
  ```json
106
 
107
  {
108
- "Pearson r": 0.7763133307532436,
109
- "RMSE": 0.035939376801252365,
110
- "Spearman rho": 0.9701919932392147
111
  }
112
  ```
113
 
114
- <img src="predictions-test.png" width=450>
115
 
116
  ## Training data details
117
 
 
12
 
13
  # Predictor of _Yersinia pestis_ MICs
14
 
15
+ _Updated:_ Tue 1 Apr 08:03:01 BST 2025
16
 
17
  Trained on the _Yersinia pestis_, WT accumulator phenotype subset of the [human-curated SPARK dataset](https://doi.org/10.1021/acsinfecdis.8b00193) (10002 rows in total for _Yersinia pestis_).
18
 
 
26
  based on that training data.
27
 
28
  This model is the best regression model from a hyperparameter search, determined
29
+ by Pearson's $$r$$ on a held-out test set not used in training or early stopping.
30
 
31
  ### Model architecture
32
 
 
35
  ```json
36
 
37
  {
38
+ "dropout": 0.0,
39
  "ensemble_size": 3,
40
  "extra_featurizers": null,
41
+ "learning_rate": 0.0001,
42
+ "model_class": "ChempropModelBox",
43
+ "n_hidden": 1,
44
+ "n_units": 256,
45
  "use_2d": true,
46
  "use_fp": true
47
  }
 
79
  ```json
80
 
81
  {
82
+ "Pearson r": 0.9997487398096949,
83
+ "RMSE": 0.001503719948232174,
84
+ "Spearman rho": 0.9995015503182805
85
  }
86
  ```
87
 
88
+ <img src="predictions_train.png" width=450>
89
 
90
  Validation (1499 rows):
91
 
92
  ```json
93
 
94
  {
95
+ "Pearson r": 0.7851248408551393,
96
+ "RMSE": 0.019412975758314133,
97
+ "Spearman rho": 0.934205549109032
98
  }
99
  ```
100
 
101
+ <img src="predictions_validation.png" width=450>
102
 
103
  Test (1501 rows):
104
 
105
  ```json
106
 
107
  {
108
+ "Pearson r": 0.7895561902856947,
109
+ "RMSE": 0.016430044546723366,
110
+ "Spearman rho": 0.9670287095583492
111
  }
112
  ```
113
 
114
+ <img src="predictions_test.png" width=450>
115
 
116
  ## Training data details
117
 
data-load-args.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "cache": "/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/117/cache",
3
  "features": [
4
  "smiles"
5
  ],
 
1
  {
2
+ "cache": "/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/79/cache",
3
  "features": [
4
  "smiles"
5
  ],
eval-metrics_test.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "Pearson r": 0.7763133307532436,
3
- "RMSE": 0.035939376801252365,
4
- "Spearman rho": 0.9701919932392147
5
  }
 
1
  {
2
+ "Pearson r": 0.7895561902856947,
3
+ "RMSE": 0.016430044546723366,
4
+ "Spearman rho": 0.9670287095583492
5
  }
eval-metrics_train.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "Pearson r": 0.8172829331255921,
3
- "RMSE": 0.031726885586977005,
4
- "Spearman rho": 0.9709886269987218
5
  }
 
1
  {
2
+ "Pearson r": 0.9997487398096949,
3
+ "RMSE": 0.001503719948232174,
4
+ "Spearman rho": 0.9995015503182805
5
  }
eval-metrics_validation.json CHANGED
@@ -1,5 +1,5 @@
1
  {
2
- "Pearson r": 0.722565134551274,
3
- "RMSE": 0.038165923207998276,
4
- "Spearman rho": 0.9247149302918276
5
  }
 
1
  {
2
+ "Pearson r": 0.7851248408551393,
3
+ "RMSE": 0.019412975758314133,
4
+ "Spearman rho": 0.934205549109032
5
  }
logs-csv/lightning_logs/version_0/hparams.yaml CHANGED
@@ -1,13 +1 @@
1
- dropout: 0.2
2
- ensemble_size: 3
3
- extra_featurizers: null
4
- learning_rate: 1.0e-05
5
- n_hidden: 5
6
- n_input: 2248
7
- n_out: 1
8
- n_units: 64
9
- optimizer: !!python/name:torch.optim.adam.Adam ''
10
- reduce_lr_on_plateau: true
11
- reduce_lr_patience: 10
12
- use_2d: true
13
- use_fp: true
 
1
+ {}
 
 
 
 
 
 
 
 
 
 
 
 
logs-csv/lightning_logs/version_0/metrics.csv CHANGED
The diff for this file is too large to render. See raw diff
 
logs/lightning_logs/version_0/events.out.tfevents.1743276990.cn075.1752123.0 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f566aa633d483ff740936c359ce37e185ed3664969e3b4f2d28e44dd63021506
3
+ size 531562
logs/lightning_logs/version_0/hparams.yaml CHANGED
@@ -1,13 +1 @@
1
- dropout: 0.2
2
- ensemble_size: 3
3
- extra_featurizers: null
4
- learning_rate: 1.0e-05
5
- n_hidden: 5
6
- n_input: 2248
7
- n_out: 1
8
- n_units: 64
9
- optimizer: !!python/name:torch.optim.adam.Adam ''
10
- reduce_lr_on_plateau: true
11
- reduce_lr_patience: 10
12
- use_2d: true
13
- use_fp: true
 
1
+ {}
 
 
 
 
 
 
 
 
 
 
 
 
metrics.csv CHANGED
@@ -1,4 +1,4 @@
1
  split,split_filename,config_i,model_class,n_parameters,filename,features,labels,cache,extra_featurizers,use_2d,use_fp,dropout,ensemble_size,learning_rate,n_hidden,n_units,val_filename,epochs,batch_size,RMSE,Pearson r,Spearman rho
2
- train,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,117,FPMLPModelBox,481923,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,['smiles'],['pmic'],/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/117/cache,,True,True,0.2,3,1e-05,5,64,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,10000,16,0.031726885586977005,0.8172829331255921,0.9709886269987218
3
- validation,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,117,FPMLPModelBox,481923,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,['smiles'],['pmic'],/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/117/cache,,True,True,0.2,3,1e-05,5,64,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,10000,16,0.038165923207998276,0.722565134551274,0.9247149302918276
4
- test,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-test.csv.gz,117,FPMLPModelBox,481923,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,['smiles'],['pmic'],/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/117/cache,,True,True,0.2,3,1e-05,5,64,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,10000,16,0.035939376801252365,0.7763133307532436,0.9701919932392147
 
1
  split,split_filename,config_i,model_class,n_parameters,filename,features,labels,cache,extra_featurizers,use_2d,use_fp,dropout,ensemble_size,learning_rate,n_hidden,n_units,val_filename,epochs,batch_size,RMSE,Pearson r,Spearman rho
2
+ train,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,79,ChempropModelBox,2641503,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,['smiles'],['pmic'],/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/79/cache,,True,True,0.0,3,0.0001,1,256,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,10000,16,0.001503719948232174,0.9997487398096949,0.9995015503182805
3
+ validation,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,79,ChempropModelBox,2641503,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,['smiles'],['pmic'],/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/79/cache,,True,True,0.0,3,0.0001,1,256,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,10000,16,0.019412975758314133,0.7851248408551393,0.934205549109032
4
+ test,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-test.csv.gz,79,ChempropModelBox,2641503,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-train.csv.gz,['smiles'],['pmic'],/nemo/lab/johnsone/home/users/johnsoe/projects/abx-discovery-strategy/models/spark-dv-2503/Yersinia-pestis/79/cache,,True,True,0.0,3,0.0001,1,256,/nemo/lab/johnsone/home/users/johnsoe/data/datasets/thomas-2018-spark-wt/Yersinia-pestis/scaffold-split-validation.csv.gz,10000,16,0.016430044546723366,0.7895561902856947,0.9670287095583492
modelbox-config.json CHANGED
@@ -1,11 +1,11 @@
1
  {
2
- "dropout": 0.2,
3
  "ensemble_size": 3,
4
  "extra_featurizers": null,
5
- "learning_rate": 1e-05,
6
- "model_class": "FPMLPModelBox",
7
- "n_hidden": 5,
8
- "n_units": 64,
9
  "use_2d": true,
10
  "use_fp": true
11
  }
 
1
  {
2
+ "dropout": 0.0,
3
  "ensemble_size": 3,
4
  "extra_featurizers": null,
5
+ "learning_rate": 0.0001,
6
+ "model_class": "ChempropModelBox",
7
+ "n_hidden": 1,
8
+ "n_units": 256,
9
  "use_2d": true,
10
  "use_fp": true
11
  }
params.pt CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:ee16fefc5cc589f4e71428f11f98164cd55805c9fd9d5e06a283979b46465670
3
- size 1947266
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7ce837e25980d811d35ae69dfa284bb6e0e0781aa2cfc61db0d304898cc0bc40
3
+ size 10591438
predictions_test.csv.gz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:a7c727a4bf844df723fd5f3795e18cad06b5d022e79916020fa624228c572f38
3
- size 37895
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:be3f965a781921dc8e800478db924ed2d0611171cae0cbcb0afe91bd755e9a7f
3
+ size 1496218
predictions_test.png CHANGED
predictions_train.csv.gz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:8c0ed7b3d692d0061c30bff80b6c9f01403b123bc440a0fd7d37b8604e7fc51b
3
- size 147876
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d941f5a5e6811308e8b132b3d134cea1b35f9569048d3507f0ebefe8be659919
3
+ size 6112338
predictions_train.png CHANGED
predictions_validation.csv.gz CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:5e49d722bbcc0f7971ca99fa787d558c669c97378ff5bb4d8d2ab41343726cd6
3
- size 36285
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b1daae2036610439d9ea825c67629c9499a1fa75ec049ff8d212bb3513a760c1
3
+ size 1428218
predictions_validation.png CHANGED
training-data.hf/cache-be42d5605d12b0e6.arrow ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0e3969c11dc8a66b325abfc4677833ab3fb0f19d5df6f159e6f38be7f7204b71
3
+ size 196560040
training-data.hf/data-00000-of-00001.arrow CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:db76da8bf189fcd4b2421108c5534d504b8d09606b4361f62d45689219923beb
3
- size 126338936
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:249c80ca1a8facf29ab36711a3baf6d535684f98faa309662d225d6de99992d9
3
+ size 195855616
training-data.hf/dataset_info.json CHANGED
@@ -14,19 +14,93 @@
14
  "download_size": 559702,
15
  "features": {
16
  "smiles": {
17
- "dtype": "string",
18
- "_type": "Value"
 
 
 
19
  },
20
  "inputs": {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
21
  "feature": {
22
  "dtype": "float64",
23
  "_type": "Value"
24
  },
25
  "_type": "Sequence"
26
  },
27
- "labels": {
28
  "feature": {
29
- "dtype": "float64",
30
  "_type": "Value"
31
  },
32
  "_type": "Sequence"
 
14
  "download_size": 559702,
15
  "features": {
16
  "smiles": {
17
+ "feature": {
18
+ "dtype": "string",
19
+ "_type": "Value"
20
+ },
21
+ "_type": "Sequence"
22
  },
23
  "inputs": {
24
+ "V_d": {
25
+ "dtype": "null",
26
+ "_type": "Value"
27
+ },
28
+ "gt_mask": {
29
+ "dtype": "null",
30
+ "_type": "Value"
31
+ },
32
+ "lt_mask": {
33
+ "dtype": "null",
34
+ "_type": "Value"
35
+ },
36
+ "mg": {
37
+ "E": {
38
+ "feature": {
39
+ "feature": {
40
+ "dtype": "float32",
41
+ "_type": "Value"
42
+ },
43
+ "_type": "Sequence"
44
+ },
45
+ "_type": "Sequence"
46
+ },
47
+ "V": {
48
+ "feature": {
49
+ "feature": {
50
+ "dtype": "float32",
51
+ "_type": "Value"
52
+ },
53
+ "_type": "Sequence"
54
+ },
55
+ "_type": "Sequence"
56
+ },
57
+ "edge_index": {
58
+ "feature": {
59
+ "feature": {
60
+ "dtype": "float32",
61
+ "_type": "Value"
62
+ },
63
+ "_type": "Sequence"
64
+ },
65
+ "_type": "Sequence"
66
+ },
67
+ "rev_edge_index": {
68
+ "feature": {
69
+ "dtype": "float32",
70
+ "_type": "Value"
71
+ },
72
+ "_type": "Sequence"
73
+ }
74
+ },
75
+ "weight": {
76
+ "dtype": "float32",
77
+ "_type": "Value"
78
+ },
79
+ "x_d": {
80
+ "feature": {
81
+ "dtype": "float32",
82
+ "_type": "Value"
83
+ },
84
+ "_type": "Sequence"
85
+ },
86
+ "y": {
87
+ "feature": {
88
+ "dtype": "float32",
89
+ "_type": "Value"
90
+ },
91
+ "_type": "Sequence"
92
+ }
93
+ },
94
+ "labels": {
95
  "feature": {
96
  "dtype": "float64",
97
  "_type": "Value"
98
  },
99
  "_type": "Sequence"
100
  },
101
+ "extra_features": {
102
  "feature": {
103
+ "dtype": "float32",
104
  "_type": "Value"
105
  },
106
  "_type": "Sequence"
training-data.hf/state.json CHANGED
@@ -4,7 +4,7 @@
4
  "filename": "data-00000-of-00001.arrow"
5
  }
6
  ],
7
- "_fingerprint": "187fee60d2df35f6",
8
  "_format_columns": null,
9
  "_format_kwargs": {
10
  "dtype": "float"
 
4
  "filename": "data-00000-of-00001.arrow"
5
  }
6
  ],
7
+ "_fingerprint": "0f7055de735ed62e",
8
  "_format_columns": null,
9
  "_format_kwargs": {
10
  "dtype": "float"
training-log.csv CHANGED
The diff for this file is too large to render. See raw diff
 
training-log.png CHANGED