natelgrw committed on
Commit
e966be6
·
1 Parent(s): cbecb06

initial commit

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
AMAX_MLP1/AMAX_MLP1.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf9816fdd6f1243656b8553a8f3f32caf288ed0928f10927939ba4cf53bbc0e0
3
+ size 3405421
AMAX_MLP1/summary.json ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_info": {
3
+ "name": "AMAX_MLP1",
4
+ "type": "Multi-Layer Perceptron",
5
+ "framework": "PyTorch",
6
+ "architecture": "Sequential Neural Network",
7
+ "created_date": "2024-10-12"
8
+ },
9
+ "architecture": {
10
+ "input_size": 312,
11
+ "hidden_layers": [1024, 512],
12
+ "output_size": 1,
13
+ "activation": "tanh",
14
+ "dropout_rate": 0.1,
15
+ "total_parameters": "~0.85M",
16
+ "model_depth": 4
17
+ },
18
+ "training_config": {
19
+ "optimizer": "Adam",
20
+ "learning_rate": 0.01,
21
+ "batch_size": 2048,
22
+ "max_epochs": 150,
23
+ "patience": 25,
24
+ "early_stopping": true,
25
+ "device": "cuda",
26
+ "data_leakage": false
27
+ },
28
+ "performance_metrics": {
29
+ "validation": {
30
+ "r2_score": 0.8664,
31
+ "mae": 26.187,
32
+ "rmse": 42.885,
33
+ "realistic": true
34
+ },
35
+ "test": {
36
+ "r2_score": 0.8913,
37
+ "mae": 23.956,
38
+ "rmse": 38.680,
39
+ "realistic": true
40
+ }
41
+ },
42
+ "data_info": {
43
+ "training_samples": 32010,
44
+ "validation_samples": 4001,
45
+ "test_samples": 4002,
46
+ "features": 312,
47
+ "feature_type": "RDKit molecular descriptors",
48
+ "target": "lambda_max (nm)",
49
+ "data_split": "random"
50
+ },
51
+ "feature_importance": {
52
+ "method": "permutation_importance",
53
+ "top_descriptors": [
54
+ "BertzCT_comp",
55
+ "NumHAcceptors_comp",
56
+ "SLogP_VSA8_comp",
57
+ "SMR_VSA8_comp",
58
+ "fr_halogen_comp"
59
+ ]
60
+ }
61
+ }
AMAX_RF1/AMAX_RF1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:745133fdcad01a03aec3d82bb8311c62795869c20698b1de9081ff8ded4e7473
3
+ size 1226818065
AMAX_RF1/summary.json ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_info": {
3
+ "name": "AMAX_RF1",
4
+ "type": "Random Forest Regressor",
5
+ "framework": "Scikit-learn",
6
+ "architecture": "Ensemble of Decision Trees",
7
+ "created_date": "2024-10-13",
8
+ "status": "Production Ready (Retrained)"
9
+ },
10
+ "architecture": {
11
+ "n_estimators": 500,
12
+ "max_depth": null,
13
+ "min_samples_split": 2,
14
+ "min_samples_leaf": 1,
15
+ "max_features": "sqrt",
16
+ "bootstrap": true,
17
+ "random_state": 42,
18
+ "n_jobs": 32
19
+ },
20
+ "training_config": {
21
+ "hyperparameter_search": "GridSearchCV",
22
+ "cv_folds": 3,
23
+ "scoring_metric": "neg_mean_absolute_error",
24
+ "parameter_combinations": 32,
25
+ "training_data_only": true,
26
+ "data_leakage": false,
27
+ "retrained": true
28
+ },
29
+ "performance_metrics": {
30
+ "validation": {
31
+ "r2_score": 0.8865,
32
+ "mae": 20.412,
33
+ "rmse": 39.519,
34
+ "realistic": true,
35
+ "note": "Expected after retraining without data leakage"
36
+ },
37
+ "test": {
38
+ "r2_score": 0.9035,
39
+ "mae": 18.601,
40
+ "rmse": 36.441,
41
+ "realistic": true
42
+ }
43
+ },
44
+ "data_info": {
45
+ "training_samples": 32010,
46
+ "validation_samples": 4001,
47
+ "test_samples": 4002,
48
+ "features": 312,
49
+ "feature_type": "RDKit molecular descriptors",
50
+ "target": "lambda_max (nm)",
51
+ "data_split": "random"
52
+ },
53
+ "feature_importance": {
54
+ "method": "built_in_feature_importances",
55
+ "top_descriptors": [
56
+ "NumAliphaticRings_comp",
57
+ "MaxEStateIndex_comp",
58
+ "NumAliphaticHeterocycles_comp",
59
+ "PEOE_VSA8_comp",
60
+ "SMR_VSA9_comp"
61
+ ]
62
+ }
63
+ }
AMAX_XGB1/AMAX_XGB1.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f7a42db68102b384847ac68a3940887682cdf0d201fa0a1e1307ed87de1927c6
3
+ size 8038457
AMAX_XGB1/summary.json ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_info": {
3
+ "name": "AMAX_XGB1",
4
+ "type": "XGBoost Regressor",
5
+ "framework": "XGBoost",
6
+ "architecture": "Gradient Boosting Decision Trees",
7
+ "created_date": "2024-10-13",
8
+ "status": "Production Ready (Retrained)"
9
+ },
10
+ "architecture": {
11
+ "n_estimators": 500,
12
+ "max_depth": 9,
13
+ "learning_rate": 0.1,
14
+ "subsample": 0.8,
15
+ "colsample_bytree": 0.8,
16
+ "reg_alpha": 0.0,
17
+ "reg_lambda": 1.0,
18
+ "random_state": 42,
19
+ "tree_method": "hist",
20
+ "device": "cuda"
21
+ },
22
+ "training_config": {
23
+ "hyperparameter_search": "GridSearchCV",
24
+ "cv_folds": 3,
25
+ "scoring_metric": "neg_mean_absolute_error",
26
+ "parameter_combinations": 32,
27
+ "training_data_only": true,
28
+ "data_leakage": false,
29
+ "retrained": true,
30
+ "gpu_acceleration": true
31
+ },
32
+ "performance_metrics": {
33
+ "validation": {
34
+ "r2_score": 0.8882,
35
+ "mae": 19.567,
36
+ "rmse": 39.219,
37
+ "realistic": true,
38
+ "note": "Expected after retraining without data leakage"
39
+ },
40
+ "test": {
41
+ "r2_score": 0.9084,
42
+ "mae": 17.682,
43
+ "rmse": 35.507,
44
+ "realistic": true
45
+ }
46
+ },
47
+ "data_info": {
48
+ "training_samples": 32010,
49
+ "validation_samples": 4001,
50
+ "test_samples": 4002,
51
+ "features": 312,
52
+ "feature_type": "RDKit molecular descriptors",
53
+ "target": "lambda_max (nm)",
54
+ "data_split": "random"
55
+ },
56
+ "feature_importance": {
57
+ "method": "built_in_feature_importances",
58
+ "top_descriptors": [
59
+ "NumAliphaticRings_comp",
60
+ "fr_Imine_comp",
61
+ "NumAliphaticHeterocycles_comp",
62
+ "fr_azo_comp",
63
+ "fr_COO_comp"
64
+ ]
65
+ }
66
+ }
README.md CHANGED
@@ -1,3 +1,35 @@
1
  ---
2
  license: mit
 
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  license: mit
3
+ tags:
4
+ - chemistry
5
+ - molecular-property-prediction
6
+ - drug-discovery
7
  ---
8
+
9
+ # AMAX Models: Molecular Absorption Wavelength Prediction
10
+
11
+ A collection of machine learning models for predicting the maximum absorption wavelength (λ_max) of chemical compounds in various solvents. These models use molecular descriptors to predict spectroscopic properties, useful for drug discovery, materials science, and computational chemistry applications.
12
+
13
+ ## 🤖 Available Models
14
+
15
+ | Model | Framework | Architecture | R² Score | MAE (nm) | RMSE (nm) | Status |
16
+ |-------|-----------|--------------|----------|----------|-----------|---------|
17
+ | **AMAX_XGB1** | XGBoost | Gradient Boosting (500 estimators) | 0.9084 | 17.682 | 35.507 | Active |
18
+ | **AMAX_RF1** | Scikit-Learn | Random Forest (500 trees) | 0.9035 | 18.601 | 36.441 | Active |
19
+ | **AMAX_MLP1** | PyTorch | Sequential NN (1024 → 512) | 0.8913 | 23.956 | 38.680 | Active |
20
+
21
+ All models utilize **312 RDKit molecular descriptors** combining both compound and solvent features, trained on a random data split of **32,010 training samples** with **4,001 validation** and **4,002 test samples**. Each model has been retrained to eliminate data leakage and ensure robust performance evaluation.
22
+
23
+ ## 📄 Citation
24
+
25
+ If you use an AMAX prediction model in your research, please cite:
26
+
27
+ ```bibtex
28
+ @misc{amaxmodels,
29
+ title={AMAX-Models: Machine Learning Models for Molecular Absorption Wavelength Prediction},
30
+ author={Leung, Nathan},
31
+ institution={Coley Research Group @ MIT},
32
+ year={2025},
33
+ howpublished={\url{https://huggingface.co/natelgrw/AMAX-Models}},
34
+ }
35
+ ```