initial commit

Browse files

Files changed (8) hide show

.DS_Store +0 -0
AMAX_MLP1/AMAX_MLP1.pth +3 -0
AMAX_MLP1/summary.json +61 -0
AMAX_RF1/AMAX_RF1.pkl +3 -0
AMAX_RF1/summary.json +63 -0
AMAX_XGB1/AMAX_XGB1.pkl +3 -0
AMAX_XGB1/summary.json +66 -0
README.md +32 -0

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

AMAX_MLP1/AMAX_MLP1.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf9816fdd6f1243656b8553a8f3f32caf288ed0928f10927939ba4cf53bbc0e0
+size 3405421

AMAX_MLP1/summary.json ADDED Viewed

	@@ -0,0 +1,61 @@

+{
+  "model_info": {
+    "name": "AMAX_MLP1",
+    "type": "Multi-Layer Perceptron",
+    "framework": "PyTorch",
+    "architecture": "Sequential Neural Network",
+    "created_date": "2024-10-12"
+  },
+  "architecture": {
+    "input_size": 312,
+    "hidden_layers": [1024, 512],
+    "output_size": 1,
+    "activation": "tanh",
+    "dropout_rate": 0.1,
+    "total_parameters": "~0.85M",
+    "model_depth": 4
+  },
+  "training_config": {
+    "optimizer": "Adam",
+    "learning_rate": 0.01,
+    "batch_size": 2048,
+    "max_epochs": 150,
+    "patience": 25,
+    "early_stopping": true,
+    "device": "cuda",
+    "data_leakage": false
+  },
+  "performance_metrics": {
+    "validation": {
+      "r2_score": 0.8664,
+      "mae": 26.187,
+      "rmse": 42.885,
+      "realistic": true
+    },
+    "test": {
+      "r2_score": 0.8913,
+      "mae": 23.956,
+      "rmse": 38.680,
+      "realistic": true
+    }
+  },
+  "data_info": {
+    "training_samples": 32010,
+    "validation_samples": 4001,
+    "test_samples": 4002,
+    "features": 312,
+    "feature_type": "RDKit molecular descriptors",
+    "target": "lambda_max (nm)",
+    "data_split": "random"
+  },
+  "feature_importance": {
+    "method": "permutation_importance",
+    "top_descriptors": [
+      "BertzCT_comp",
+      "NumHAcceptors_comp",
+      "SLogP_VSA8_comp",
+      "SMR_VSA8_comp",
+      "fr_halogen_comp"
+    ]
+  }
+}

AMAX_RF1/AMAX_RF1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:745133fdcad01a03aec3d82bb8311c62795869c20698b1de9081ff8ded4e7473
+size 1226818065

AMAX_RF1/summary.json ADDED Viewed

	@@ -0,0 +1,63 @@

+{
+  "model_info": {
+    "name": "AMAX_RF1",
+    "type": "Random Forest Regressor",
+    "framework": "Scikit-learn",
+    "architecture": "Ensemble of Decision Trees",
+    "created_date": "2024-10-13",
+    "status": "Production Ready (Retrained)"
+  },
+  "architecture": {
+    "n_estimators": 500,
+    "max_depth": null,
+    "min_samples_split": 2,
+    "min_samples_leaf": 1,
+    "max_features": "sqrt",
+    "bootstrap": true,
+    "random_state": 42,
+    "n_jobs": 32
+  },
+  "training_config": {
+    "hyperparameter_search": "GridSearchCV",
+    "cv_folds": 3,
+    "scoring_metric": "neg_mean_absolute_error",
+    "parameter_combinations": 32,
+    "training_data_only": true,
+    "data_leakage": false,
+    "retrained": true
+  },
+  "performance_metrics": {
+    "validation": {
+      "r2_score": 0.8865,
+      "mae": 20.412,
+      "rmse": 39.519,
+      "realistic": true,
+      "note": "Expected after retraining without data leakage"
+    },
+    "test": {
+      "r2_score": 0.9035,
+      "mae": 18.601,
+      "rmse": 36.441,
+      "realistic": true
+    }
+  },
+  "data_info": {
+    "training_samples": 32010,
+    "validation_samples": 4001,
+    "test_samples": 4002,
+    "features": 312,
+    "feature_type": "RDKit molecular descriptors",
+    "target": "lambda_max (nm)",
+    "data_split": "random"
+  },
+  "feature_importance": {
+    "method": "built_in_feature_importances",
+    "top_descriptors": [
+      "NumAliphaticRings_comp",
+      "MaxEStateIndex_comp",
+      "NumAliphaticHeterocycles_comp",
+      "PEOE_VSA8_comp",
+      "SMR_VSA9_comp"
+    ]
+  }
+}

AMAX_XGB1/AMAX_XGB1.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:f7a42db68102b384847ac68a3940887682cdf0d201fa0a1e1307ed87de1927c6
+size 8038457

AMAX_XGB1/summary.json ADDED Viewed

	@@ -0,0 +1,66 @@

+{
+  "model_info": {
+    "name": "AMAX_XGB1",
+    "type": "XGBoost Regressor",
+    "framework": "XGBoost",
+    "architecture": "Gradient Boosting Decision Trees",
+    "created_date": "2024-10-13",
+    "status": "Production Ready (Retrained)"
+  },
+  "architecture": {
+    "n_estimators": 500,
+    "max_depth": 9,
+    "learning_rate": 0.1,
+    "subsample": 0.8,
+    "colsample_bytree": 0.8,
+    "reg_alpha": 0.0,
+    "reg_lambda": 1.0,
+    "random_state": 42,
+    "tree_method": "hist",
+    "device": "cuda"
+  },
+  "training_config": {
+    "hyperparameter_search": "GridSearchCV",
+    "cv_folds": 3,
+    "scoring_metric": "neg_mean_absolute_error",
+    "parameter_combinations": 32,
+    "training_data_only": true,
+    "data_leakage": false,
+    "retrained": true,
+    "gpu_acceleration": true
+  },
+  "performance_metrics": {
+    "validation": {
+      "r2_score": 0.8882,
+      "mae": 19.567,
+      "rmse": 39.219,
+      "realistic": true,
+      "note": "Expected after retraining without data leakage"
+    },
+    "test": {
+      "r2_score": 0.9084,
+      "mae": 17.682,
+      "rmse": 35.507,
+      "realistic": true
+    }
+  },
+  "data_info": {
+    "training_samples": 32010,
+    "validation_samples": 4001,
+    "test_samples": 4002,
+    "features": 312,
+    "feature_type": "RDKit molecular descriptors",
+    "target": "lambda_max (nm)",
+    "data_split": "random"
+  },
+  "feature_importance": {
+    "method": "built_in_feature_importances",
+    "top_descriptors": [
+      "NumAliphaticRings_comp",
+      "fr_Imine_comp",
+      "NumAliphaticHeterocycles_comp",
+      "fr_azo_comp",
+      "fr_COO_comp"
+    ]
+  }
+}

README.md CHANGED Viewed

@@ -1,3 +1,35 @@
 ---
 license: mit
 ---

 ---
 license: mit
+tags:
+- chemistry
+- molecular-property-prediction
+- drug-discovery
 ---
+# AMAX Models: Molecular Absorption Wavelength Prediction
+A collection of machine learning models for predicting the maximum absorption wavelength (λ_max) of chemical compounds in various solvents. These models use molecular descriptors to predict spectroscopic properties, which is useful for drug discovery, materials science, and computational chemistry applications.
+## 🤖 Available Models
+| Model | Framework | Architecture | R² Score | MAE (nm) | RMSE (nm) | Status |
+|-------|-----------|--------------|----------|----------|-----------|---------|
+| **AMAX_XGB1** | XGBoost | Gradient Boosting (500 estimators) | 0.9084 | 17.682 | 35.507 | Active |
+| **AMAX_RF1** | scikit-learn | Random Forest (500 trees) | 0.9035 | 18.601 | 36.441 | Active |
+| **AMAX_MLP1** | PyTorch | Sequential NN (1024 → 512) | 0.8913 | 23.956 | 38.680 | Active |
+All models utilize **312 RDKit molecular descriptors** combining both compound and solvent features, trained on a random data split of **32,010 training samples** with **4,001 validation** and **4,002 test samples**. The R², MAE, and RMSE values in the table above are measured on the held-out test set. Each model has been retrained to eliminate data leakage and ensure robust performance evaluation.
+## 📄 Citation
+If you use an AMAX prediction model in your research, please cite:
+```bibtex
+@misc{amaxmodels,
+  title={AMAX-Models: Machine Learning Models for Molecular Absorption Wavelength Prediction},
+  author={Leung, Nathan},
+  institution={Coley Research Group @ MIT},
+  year={2025},
+  howpublished={\url{https://huggingface.co/natelgrw/AMAX-Models}},
+}
+```