initial commit
Browse files- .DS_Store +0 -0
- AMAX_MLP1/AMAX_MLP1.pth +3 -0
- AMAX_MLP1/summary.json +61 -0
- AMAX_RF1/AMAX_RF1.pkl +3 -0
- AMAX_RF1/summary.json +63 -0
- AMAX_XGB1/AMAX_XGB1.pkl +3 -0
- AMAX_XGB1/summary.json +66 -0
- README.md +32 -0
.DS_Store
ADDED
|
Binary file (6.15 kB). View file
|
|
|
AMAX_MLP1/AMAX_MLP1.pth
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:cf9816fdd6f1243656b8553a8f3f32caf288ed0928f10927939ba4cf53bbc0e0
|
| 3 |
+
size 3405421
|
AMAX_MLP1/summary.json
ADDED
|
@@ -0,0 +1,61 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_info": {
|
| 3 |
+
"name": "AMAX_MLP1",
|
| 4 |
+
"type": "Multi-Layer Perceptron",
|
| 5 |
+
"framework": "PyTorch",
|
| 6 |
+
"architecture": "Sequential Neural Network",
|
| 7 |
+
"created_date": "2024-10-12"
|
| 8 |
+
},
|
| 9 |
+
"architecture": {
|
| 10 |
+
"input_size": 312,
|
| 11 |
+
"hidden_layers": [1024, 512],
|
| 12 |
+
"output_size": 1,
|
| 13 |
+
"activation": "tanh",
|
| 14 |
+
"dropout_rate": 0.1,
|
| 15 |
+
"total_parameters": "~0.85M",
|
| 16 |
+
"model_depth": 4
|
| 17 |
+
},
|
| 18 |
+
"training_config": {
|
| 19 |
+
"optimizer": "Adam",
|
| 20 |
+
"learning_rate": 0.01,
|
| 21 |
+
"batch_size": 2048,
|
| 22 |
+
"max_epochs": 150,
|
| 23 |
+
"patience": 25,
|
| 24 |
+
"early_stopping": true,
|
| 25 |
+
"device": "cuda",
|
| 26 |
+
"data_leakage": false
|
| 27 |
+
},
|
| 28 |
+
"performance_metrics": {
|
| 29 |
+
"validation": {
|
| 30 |
+
"r2_score": 0.8664,
|
| 31 |
+
"mae": 26.187,
|
| 32 |
+
"rmse": 42.885,
|
| 33 |
+
"realistic": true
|
| 34 |
+
},
|
| 35 |
+
"test": {
|
| 36 |
+
"r2_score": 0.8913,
|
| 37 |
+
"mae": 23.956,
|
| 38 |
+
"rmse": 38.680,
|
| 39 |
+
"realistic": true
|
| 40 |
+
}
|
| 41 |
+
},
|
| 42 |
+
"data_info": {
|
| 43 |
+
"training_samples": 32010,
|
| 44 |
+
"validation_samples": 4001,
|
| 45 |
+
"test_samples": 4002,
|
| 46 |
+
"features": 312,
|
| 47 |
+
"feature_type": "RDKit molecular descriptors",
|
| 48 |
+
"target": "lambda_max (nm)",
|
| 49 |
+
"data_split": "random"
|
| 50 |
+
},
|
| 51 |
+
"feature_importance": {
|
| 52 |
+
"method": "permutation_importance",
|
| 53 |
+
"top_descriptors": [
|
| 54 |
+
"BertzCT_comp",
|
| 55 |
+
"NumHAcceptors_comp",
|
| 56 |
+
"SLogP_VSA8_comp",
|
| 57 |
+
"SMR_VSA8_comp",
|
| 58 |
+
"fr_halogen_comp"
|
| 59 |
+
]
|
| 60 |
+
}
|
| 61 |
+
}
|
AMAX_RF1/AMAX_RF1.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:745133fdcad01a03aec3d82bb8311c62795869c20698b1de9081ff8ded4e7473
|
| 3 |
+
size 1226818065
|
AMAX_RF1/summary.json
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_info": {
|
| 3 |
+
"name": "AMAX_RF1",
|
| 4 |
+
"type": "Random Forest Regressor",
|
| 5 |
+
"framework": "Scikit-learn",
|
| 6 |
+
"architecture": "Ensemble of Decision Trees",
|
| 7 |
+
"created_date": "2024-10-13",
|
| 8 |
+
"status": "Production Ready (Retrained)"
|
| 9 |
+
},
|
| 10 |
+
"architecture": {
|
| 11 |
+
"n_estimators": 500,
|
| 12 |
+
"max_depth": null,
|
| 13 |
+
"min_samples_split": 2,
|
| 14 |
+
"min_samples_leaf": 1,
|
| 15 |
+
"max_features": "sqrt",
|
| 16 |
+
"bootstrap": true,
|
| 17 |
+
"random_state": 42,
|
| 18 |
+
"n_jobs": 32
|
| 19 |
+
},
|
| 20 |
+
"training_config": {
|
| 21 |
+
"hyperparameter_search": "GridSearchCV",
|
| 22 |
+
"cv_folds": 3,
|
| 23 |
+
"scoring_metric": "neg_mean_absolute_error",
|
| 24 |
+
"parameter_combinations": 32,
|
| 25 |
+
"training_data_only": true,
|
| 26 |
+
"data_leakage": false,
|
| 27 |
+
"retrained": true
|
| 28 |
+
},
|
| 29 |
+
"performance_metrics": {
|
| 30 |
+
"validation": {
|
| 31 |
+
"r2_score": 0.8865,
|
| 32 |
+
"mae": 20.412,
|
| 33 |
+
"rmse": 39.519,
|
| 34 |
+
"realistic": true,
|
| 35 |
+
"note": "Expected after retraining without data leakage"
|
| 36 |
+
},
|
| 37 |
+
"test": {
|
| 38 |
+
"r2_score": 0.9035,
|
| 39 |
+
"mae": 18.601,
|
| 40 |
+
"rmse": 36.441,
|
| 41 |
+
"realistic": true
|
| 42 |
+
}
|
| 43 |
+
},
|
| 44 |
+
"data_info": {
|
| 45 |
+
"training_samples": 32010,
|
| 46 |
+
"validation_samples": 4001,
|
| 47 |
+
"test_samples": 4002,
|
| 48 |
+
"features": 312,
|
| 49 |
+
"feature_type": "RDKit molecular descriptors",
|
| 50 |
+
"target": "lambda_max (nm)",
|
| 51 |
+
"data_split": "random"
|
| 52 |
+
},
|
| 53 |
+
"feature_importance": {
|
| 54 |
+
"method": "built_in_feature_importances",
|
| 55 |
+
"top_descriptors": [
|
| 56 |
+
"NumAliphaticRings_comp",
|
| 57 |
+
"MaxEStateIndex_comp",
|
| 58 |
+
"NumAliphaticHeterocycles_comp",
|
| 59 |
+
"PEOE_VSA8_comp",
|
| 60 |
+
"SMR_VSA9_comp"
|
| 61 |
+
]
|
| 62 |
+
}
|
| 63 |
+
}
|
AMAX_XGB1/AMAX_XGB1.pkl
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:f7a42db68102b384847ac68a3940887682cdf0d201fa0a1e1307ed87de1927c6
|
| 3 |
+
size 8038457
|
AMAX_XGB1/summary.json
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"model_info": {
|
| 3 |
+
"name": "AMAX_XGB1",
|
| 4 |
+
"type": "XGBoost Regressor",
|
| 5 |
+
"framework": "XGBoost",
|
| 6 |
+
"architecture": "Gradient Boosting Decision Trees",
|
| 7 |
+
"created_date": "2024-10-13",
|
| 8 |
+
"status": "Production Ready (Retrained)"
|
| 9 |
+
},
|
| 10 |
+
"architecture": {
|
| 11 |
+
"n_estimators": 500,
|
| 12 |
+
"max_depth": 9,
|
| 13 |
+
"learning_rate": 0.1,
|
| 14 |
+
"subsample": 0.8,
|
| 15 |
+
"colsample_bytree": 0.8,
|
| 16 |
+
"reg_alpha": 0.0,
|
| 17 |
+
"reg_lambda": 1.0,
|
| 18 |
+
"random_state": 42,
|
| 19 |
+
"tree_method": "hist",
|
| 20 |
+
"device": "cuda"
|
| 21 |
+
},
|
| 22 |
+
"training_config": {
|
| 23 |
+
"hyperparameter_search": "GridSearchCV",
|
| 24 |
+
"cv_folds": 3,
|
| 25 |
+
"scoring_metric": "neg_mean_absolute_error",
|
| 26 |
+
"parameter_combinations": 32,
|
| 27 |
+
"training_data_only": true,
|
| 28 |
+
"data_leakage": false,
|
| 29 |
+
"retrained": true,
|
| 30 |
+
"gpu_acceleration": true
|
| 31 |
+
},
|
| 32 |
+
"performance_metrics": {
|
| 33 |
+
"validation": {
|
| 34 |
+
"r2_score": 0.8882,
|
| 35 |
+
"mae": 19.567,
|
| 36 |
+
"rmse": 39.219,
|
| 37 |
+
"realistic": true,
|
| 38 |
+
"note": "Expected after retraining without data leakage"
|
| 39 |
+
},
|
| 40 |
+
"test": {
|
| 41 |
+
"r2_score": 0.9084,
|
| 42 |
+
"mae": 17.682,
|
| 43 |
+
"rmse": 35.507,
|
| 44 |
+
"realistic": true
|
| 45 |
+
}
|
| 46 |
+
},
|
| 47 |
+
"data_info": {
|
| 48 |
+
"training_samples": 32010,
|
| 49 |
+
"validation_samples": 4001,
|
| 50 |
+
"test_samples": 4002,
|
| 51 |
+
"features": 312,
|
| 52 |
+
"feature_type": "RDKit molecular descriptors",
|
| 53 |
+
"target": "lambda_max (nm)",
|
| 54 |
+
"data_split": "random"
|
| 55 |
+
},
|
| 56 |
+
"feature_importance": {
|
| 57 |
+
"method": "built_in_feature_importances",
|
| 58 |
+
"top_descriptors": [
|
| 59 |
+
"NumAliphaticRings_comp",
|
| 60 |
+
"fr_Imine_comp",
|
| 61 |
+
"NumAliphaticHeterocycles_comp",
|
| 62 |
+
"fr_azo_comp",
|
| 63 |
+
"fr_COO_comp"
|
| 64 |
+
]
|
| 65 |
+
}
|
| 66 |
+
}
|
README.md
CHANGED
|
@@ -1,3 +1,35 @@
|
|
| 1 |
---
|
| 2 |
license: mit
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
---
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
---
|
| 2 |
license: mit
|
| 3 |
+
tags:
|
| 4 |
+
- chemistry
|
| 5 |
+
- molecular-property-prediction
|
| 6 |
+
- drug-discovery
|
| 7 |
---
|
| 8 |
+
|
| 9 |
+
# AMAX Models: Molecular Absorption Wavelength Prediction
|
| 10 |
+
|
| 11 |
+
A collection of machine learning models for predicting the maximum absorption wavelength (λ_max) of chemical compounds in various solvents. These models use molecular descriptors to predict spectroscopic properties, useful for drug discovery, materials science, and computational chemistry applications.
|
| 12 |
+
|
| 13 |
+
## 🤖 Available Models
|
| 14 |
+
|
| 15 |
+
| Model | Framework | Architecture | R² Score | MAE (nm) | RMSE (nm) | Status |
|
| 16 |
+
|-------|-----------|--------------|----------|----------|-----------|---------|
|
| 17 |
+
| **AMAX_XGB1** | XGBoost | Gradient Boosting (500 estimators) | 0.9084 | 17.682 | 35.507 | Active |
|
| 18 |
+
| **AMAX_RF1** | Scikit-learn | Random Forest (500 trees) | 0.9035 | 18.601 | 36.441 | Active |
|
| 19 |
+
| **AMAX_MLP1** | PyTorch | Sequential NN (1024 → 512) | 0.8913 | 23.956 | 38.680 | Active |
|
| 20 |
+
|
| 21 |
+
All models use **312 RDKit molecular descriptors** combining both compound and solvent features, and were trained on a random data split of **32,010 training samples** with **4,001 validation** and **4,002 test samples**. The metrics reported in the table above are measured on the held-out test set. Each model has been retrained to eliminate data leakage and ensure robust performance evaluation.
|
| 22 |
+
|
| 23 |
+
## 📄 Citation
|
| 24 |
+
|
| 25 |
+
If you use an AMAX prediction model in your research, please cite:
|
| 26 |
+
|
| 27 |
+
```bibtex
|
| 28 |
+
@misc{amaxmodels,
|
| 29 |
+
title={AMAX-Models: Machine Learning Models for Molecular Absorption Wavelength Prediction},
|
| 30 |
+
author={Leung, Nathan},
|
| 31 |
+
institution={Coley Research Group @ MIT},
|
| 32 |
+
year={2025},
|
| 33 |
+
howpublished={\url{https://huggingface.co/natelgrw/AMAX-Models}},
|
| 34 |
+
}
|
| 35 |
+
```
|