Spaces:
Build error
Build error
new structure
Browse files- applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc +0 -0
- applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc +0 -0
- applications/3_molecule_generator/__pycache__/main.cpython-310.pyc +0 -0
- applications/3_molecule_generator/__pycache__/results.cpython-310.pyc +0 -0
- applications/__pycache__/__init__.cpython-310.pyc +0 -0
- core/__pycache__/__init__.cpython-310.pyc +0 -0
- core/__pycache__/config.cpython-310.pyc +0 -0
- core/__pycache__/data_prep.cpython-310.pyc +0 -0
- {src → core}/__pycache__/shared_features.cpython-310.pyc +0 -0
- core/evolution/__pycache__/__init__.cpython-310.pyc +0 -0
- core/evolution/__pycache__/evolution.cpython-310.pyc +0 -0
- core/evolution/__pycache__/molecule.cpython-310.pyc +0 -0
- core/evolution/__pycache__/population.cpython-310.pyc +0 -0
- core/predictors/__pycache__/__init__.cpython-310.pyc +0 -0
- core/predictors/pure_component/__pycache__/generic.cpython-310.pyc +0 -0
- core/predictors/pure_component/__pycache__/hf_models.cpython-310.pyc +0 -0
- core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc +0 -0
- src/__pycache__/data_prep.cpython-310.pyc +0 -0
- src/data_prep.py +0 -36
- src/database_main.db +0 -3
- src/diesel_fragments.db +0 -3
- src/main.py +0 -704
- src/model_config.py +0 -53
- src/shared_features.py +0 -233
- src/streamlit_app.py +0 -161
applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (143 Bytes). View file
|
|
|
applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc
ADDED
|
Binary file (1.76 kB). View file
|
|
|
applications/3_molecule_generator/__pycache__/main.cpython-310.pyc
ADDED
|
Binary file (918 Bytes). View file
|
|
|
applications/3_molecule_generator/__pycache__/results.cpython-310.pyc
ADDED
|
Binary file (1.85 kB). View file
|
|
|
applications/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (122 Bytes). View file
|
|
|
core/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (138 Bytes). View file
|
|
|
core/__pycache__/config.cpython-310.pyc
ADDED
|
Binary file (1.25 kB). View file
|
|
|
core/__pycache__/data_prep.cpython-310.pyc
ADDED
|
Binary file (1 kB). View file
|
|
|
{src → core}/__pycache__/shared_features.cpython-310.pyc
RENAMED
|
Binary files a/src/__pycache__/shared_features.cpython-310.pyc and b/core/__pycache__/shared_features.cpython-310.pyc differ
|
|
|
core/evolution/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (148 Bytes). View file
|
|
|
core/evolution/__pycache__/evolution.cpython-310.pyc
ADDED
|
Binary file (7.88 kB). View file
|
|
|
core/evolution/__pycache__/molecule.cpython-310.pyc
ADDED
|
Binary file (1.57 kB). View file
|
|
|
core/evolution/__pycache__/population.cpython-310.pyc
ADDED
|
Binary file (3.7 kB). View file
|
|
|
core/predictors/__pycache__/__init__.cpython-310.pyc
ADDED
|
Binary file (149 Bytes). View file
|
|
|
core/predictors/pure_component/__pycache__/generic.cpython-310.pyc
ADDED
|
Binary file (1.68 kB). View file
|
|
|
core/predictors/pure_component/__pycache__/hf_models.cpython-310.pyc
ADDED
|
Binary file (866 Bytes). View file
|
|
|
core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc
ADDED
|
Binary file (3.31 kB). View file
|
|
|
src/__pycache__/data_prep.cpython-310.pyc
DELETED
|
Binary file (1.14 kB)
|
|
|
src/data_prep.py
DELETED
|
@@ -1,36 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sqlite3
|
| 3 |
-
import pandas as pd
|
| 4 |
-
from sklearn.model_selection import train_test_split
|
| 5 |
-
import os
|
| 6 |
-
|
| 7 |
-
# Location of the SQLite database, resolved relative to this file so the
# module works regardless of the current working directory.
BASE_DIR = os.path.dirname(os.path.abspath(__file__))
DB_PATH = os.path.join(BASE_DIR, "database_main.db")

TARGET_CN = "cn"  # Cetane number
N_FOLDS = 5
TOP_K = 5

# One row per fuel, with the standardised DCN aliased to the TARGET_CN column.
# LEFT JOIN keeps fuels without a target; those rows are dropped below.
query = """
    SELECT
        F.Fuel_Name,
        F.SMILES,
        T.Standardised_DCN AS cn
    FROM FUEL F
    LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
"""

# NOTE: everything below runs at import time (prints, DB read, split).
print("Connecting to SQLite database...")
conn = sqlite3.connect(DB_PATH)
try:
    cursor = conn.cursor()
    cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
    print(cursor.fetchall())
    df = pd.read_sql_query(query, conn)
finally:
    # Release the connection even when the listing or the query fails.
    conn.close()

# Rows without a target value or without a structure are unusable.
df.dropna(subset=[TARGET_CN, "SMILES"], inplace=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(df.head())
print(df.columns)


def load_data():
    """Return the module-level DataFrame of fuels with SMILES and cetane number."""
    return df
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/database_main.db
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:b14779692bb401ac9fc714a3aa8919d4e14f75aef9f92c6004195d89102ebcff
|
| 3 |
-
size 344064
|
|
|
|
|
|
|
|
|
|
|
|
src/diesel_fragments.db
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:9e76b070ca56ecaaf083602224e59dbff6d5f94c43960e139643c52d93472acb
|
| 3 |
-
size 10002432
|
|
|
|
|
|
|
|
|
|
|
|
src/main.py
DELETED
|
@@ -1,704 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sys
|
| 3 |
-
from pathlib import Path
|
| 4 |
-
from dataclasses import dataclass, asdict, field
|
| 5 |
-
from typing import List, Dict, Optional, Tuple, Callable
|
| 6 |
-
import joblib
|
| 7 |
-
import numpy as np
|
| 8 |
-
import pandas as pd
|
| 9 |
-
import random
|
| 10 |
-
from rdkit import Chem
|
| 11 |
-
from crem.crem import mutate_mol
|
| 12 |
-
from sklearn.base import BaseEstimator, RegressorMixin
|
| 13 |
-
from joblib import Parallel, delayed
|
| 14 |
-
from tqdm import tqdm
|
| 15 |
-
from huggingface_hub import snapshot_download
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
# Hugging Face Hub repository id for each property model, keyed by the short
# property identifier used throughout this module.
HF_MODEL_REPOS = dict(
    cn="SalZa2004/Cetane_Number_Predictor",
    ysi="SalZa2004/YSI_Predictor",
    density="SalZa2004/Density_Predictor",
    lhv="SalZa2004/LHV_Predictor",
    dynamic_viscosity="SalZa2004/Dynamic_Viscosity_Predictor",
    bp="SalZa2004/Boiling_Point_Predictor",
)
|
| 27 |
-
|
| 28 |
-
def get_hf_model_dir(repo_id: str) -> Path:
    """Fetch a model repository from the Hugging Face Hub.

    Args:
        repo_id: Hub repository identifier (for example a value of
            ``HF_MODEL_REPOS``).

    Returns:
        The local snapshot directory reported by ``snapshot_download``.
    """
    download_options = {
        "repo_id": repo_id,
        "repo_type": "model",
        # No explicit target directory: files stay in the HF cache.
        "local_dir": None,
        # NOTE(review): recent huggingface_hub releases appear to deprecate
        # this flag -- confirm against the pinned version before removing it.
        "local_dir_use_symlinks": True,
    }
    snapshot_path = snapshot_download(**download_options)
    return Path(snapshot_path)
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
# === Project Setup ===
# The root is taken from the current working directory, so these paths only
# resolve as intended when the program is launched from the repository root.
PROJECT_ROOT = Path.cwd()
SRC_DIR = PROJECT_ROOT / "src"

# Register the root once; re-executing this module must not grow sys.path.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))

# Parquet file holding the property predictions for the seed population.
INITIAL_PRED_CACHE = PROJECT_ROOT / "cache" / "initial_predictions.parquet"
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
from shared_features import FeatureSelector, featurize_df
|
| 51 |
-
from data_prep import df
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
class GenericPredictor:
    """Generic predictor that works for any property model.

    A model directory must contain ``model.joblib`` (the fitted estimator)
    and ``selector.joblib`` (the matching ``FeatureSelector``).
    """

    def __init__(self, model_dir: Path, property_name: str):
        """Load the estimator and feature selector from ``model_dir``.

        Args:
            model_dir: Directory holding ``model.joblib`` and ``selector.joblib``.
            property_name: Human-readable property name used in messages and
                in the ``Predicted_<name>`` result column.

        Raises:
            FileNotFoundError: If either artefact is missing.
        """
        self.property_name = property_name

        model_path = model_dir / "model.joblib"
        selector_path = model_dir / "selector.joblib"

        # Fail early with a precise message instead of deep inside joblib.
        for required in (model_path, selector_path):
            if not required.exists():
                raise FileNotFoundError(f"Missing {required.name} in {model_dir}")

        # SECURITY NOTE: joblib.load unpickles the file, which can execute
        # arbitrary code. Only point this class at trusted model directories.
        self.model = joblib.load(model_path)
        self.selector = FeatureSelector.load(selector_path)

        print(f"✓ Loaded {property_name} predictor")

    def predict(self, smiles_list):
        """Inference on a list of SMILES strings.

        Args:
            smiles_list: A single SMILES string or a list of them.

        Returns:
            A list of predictions, or ``[]`` when the featurizer returns None.
        """
        if isinstance(smiles_list, str):
            smiles_list = [smiles_list]

        X_full = featurize_df(smiles_list, return_df=False)

        if X_full is None:
            print(f"⚠ Warning: No valid molecules found for {self.property_name}!")
            return []

        # NOTE(review): if featurize_df drops unparsable SMILES, the result is
        # shorter than smiles_list and no longer positionally aligned -- verify.
        X_selected = self.selector.transform(X_full)
        predictions = self.model.predict(X_selected)
        return predictions.tolist()

    def predict_with_details(self, smiles_list):
        """Inference with valid/invalid info.

        Args:
            smiles_list: A single SMILES string or a list of them.

        Returns:
            A DataFrame with columns ``SMILES``, ``Predicted_<property>`` and
            ``Valid``, one row per input SMILES in input order. When the
            featurizer returns no feature matrix the frame is empty.
        """
        if isinstance(smiles_list, str):
            smiles_list = [smiles_list]

        frame = pd.DataFrame({"SMILES": smiles_list})
        X_full, df_valid = featurize_df(frame, return_df=True)

        col_name = f"Predicted_{self.property_name}"

        if X_full is None:
            return pd.DataFrame(columns=["SMILES", col_name, "Valid"])

        X_selected = self.selector.transform(X_full)
        predictions = self.model.predict(X_selected)

        # Work on a copy so the featurizer's frame is not mutated in place.
        df_valid = df_valid.copy()
        df_valid[col_name] = predictions
        df_valid["Valid"] = True

        # Keep one scored row per distinct SMILES: merging against a
        # non-unique key would multiply the rows of repeated inputs.
        scored = df_valid[["SMILES", col_name, "Valid"]].drop_duplicates(subset="SMILES")

        all_results = pd.DataFrame({"SMILES": smiles_list}).merge(
            scored, on="SMILES", how="left"
        )
        # Unmatched rows carry NaN after the merge; map them to a plain False.
        all_results["Valid"] = all_results["Valid"].eq(True)

        return all_results
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
@dataclass
class EvolutionConfig:
    """Configuration for evolutionary algorithm.

    Property bounds are compared directly against the predictor outputs, so
    their units are whatever the corresponding models produce.

    Raises:
        ValueError: From ``__post_init__`` when a value is inconsistent.
    """
    # Cetane number the search tries to match.
    target_cn: float
    # When True, YSI is a second objective (Pareto selection) and is required.
    minimize_ysi: bool = True
    generations: int = 6
    population_size: int = 50
    # NOTE(review): not read by any code visible in this file section.
    mutations_per_parent: int = 5
    # Share of population_size kept as survivors each generation.
    survivor_fraction: float = 0.5
    # Inclusive boiling-point window.
    min_bp: float = 60
    max_bp: float = 250
    # Viscosity window: exclusive lower bound, inclusive upper bound.
    min_dynamic_viscosity: float = 0.0
    max_dynamic_viscosity: float = 2.0
    # Exclusive lower bounds.
    min_density: float = 720
    min_lhv: float = 30
    # Each flag both loads the matching predictor and enables its filter.
    use_bp_filter: bool = True
    use_density_filter: bool = True
    use_lhv_filter: bool = True
    use_dynamic_viscosity_filter: bool = True
    batch_size: int = 200  # Increased default for better throughput
    # Parent draws per missing population slot when generating offspring.
    max_offspring_attempts: int = 10
    n_jobs: int = -1  # Number of parallel jobs for mutation (-1 = all cores)

    def __post_init__(self):
        """Validate configuration parameters."""
        if self.target_cn < 0:
            # Zero is accepted by the check, so the message says so.
            raise ValueError("target_cn must be non-negative")
        if not 0 < self.survivor_fraction < 1:
            raise ValueError("survivor_fraction must be between 0 and 1")
        if self.min_bp >= self.max_bp:
            raise ValueError("min_bp must be less than max_bp")
        if self.population_size < 2:
            raise ValueError("population_size must be at least 2")
        # The survivor quota is truncated to an int; a quota of zero leaves no
        # parents to mutate, so reject it here rather than mid-run.
        if int(self.population_size * self.survivor_fraction) < 1:
            raise ValueError(
                "population_size * survivor_fraction must be at least 1"
            )
|
| 153 |
-
|
| 154 |
-
@dataclass
class Molecule:
    """Represents a molecule with its properties.

    ``smiles``, ``cn`` and ``cn_error`` are always set; the remaining
    properties stay ``None`` when they were not predicted.
    """
    smiles: str
    cn: float
    # Absolute distance between cn and the configured target.
    cn_error: float
    bp: Optional[float] = None
    ysi: Optional[float] = None
    density: Optional[float] = None
    lhv: Optional[float] = None
    dynamic_viscosity: Optional[float] = None
    # Lazily built RDKit molecule; excluded from repr and equality.
    _mol_cache: Optional[Chem.Mol] = field(default=None, repr=False, compare=False)

    def get_mol(self) -> Optional[Chem.Mol]:
        """Get cached RDKit Mol object to avoid repeated conversions."""
        if self._mol_cache is None:
            self._mol_cache = Chem.MolFromSmiles(self.smiles)
        return self._mol_cache

    def dominates(self, other: 'Molecule') -> bool:
        """Check if this molecule Pareto-dominates another.

        Both objectives (``cn_error`` and ``ysi``) are minimised. When either
        molecule has no YSI value the comparison uses ``cn_error`` alone,
        instead of failing on a ``None`` comparison.
        """
        strictly_better_cn = self.cn_error < other.cn_error
        if self.ysi is None or other.ysi is None:
            return strictly_better_cn

        no_worse = self.cn_error <= other.cn_error and self.ysi <= other.ysi
        strictly_better = strictly_better_cn or self.ysi < other.ysi
        return no_worse and strictly_better

    def to_dict(self) -> Dict:
        """Convert to dictionary for DataFrame creation.

        Unset (``None``) properties and the RDKit cache are left out.
        """
        # Read the instance dict directly: dataclasses.asdict would deep-copy
        # the cached RDKit molecule only for it to be discarded here.
        return {
            name: value
            for name, value in vars(self).items()
            if value is not None and name != '_mol_cache'
        }
|
| 185 |
-
|
| 186 |
-
|
| 187 |
-
class PropertyPredictor:
    """Loads the property models a configuration needs and runs them in batch."""

    # (config flag, property key, display name), in load order.
    # A flag of None means the predictor is always loaded.
    _PREDICTOR_SPECS = (
        (None, "cn", "Cetane Number"),
        ("minimize_ysi", "ysi", "YSI"),
        ("use_bp_filter", "bp", "Boiling Point"),
        ("use_density_filter", "density", "Density"),
        ("use_lhv_filter", "lhv", "Lower Heating Value"),
        ("use_dynamic_viscosity_filter", "dynamic_viscosity", "Dynamic Viscosity"),
    )

    def __init__(self, config: EvolutionConfig):
        """Download and load every predictor enabled by ``config``.

        Args:
            config: Run configuration; its boolean flags decide which optional
                predictors are loaded. The cetane-number model is always loaded.
        """
        self.config = config
        self.predictors = {}
        # Not read or written anywhere in this class.
        self.prediction_cache = {}

        for flag, key, label in self._PREDICTOR_SPECS:
            if flag is None or getattr(config, flag):
                model_dir = get_hf_model_dir(HF_MODEL_REPOS[key])
                self.predictors[key] = GenericPredictor(model_dir, label)

    def _safe_predict(self, predictions: List) -> List[Optional[float]]:
        """Safely convert predictions, handling None/NaN/inf values.

        Returns:
            A list in which every finite value is a ``float`` and every
            ``None``, NaN or infinite value is ``None``.
        """
        return [
            float(pred) if pred is not None and np.isfinite(pred) else None
            for pred in predictions
        ]

    def _predict_batch(self, property_name: str, smiles_list: List[str]) -> List[Optional[float]]:
        """Predict one property for a list of SMILES.

        Returns:
            One value per input; all ``None`` when the predictor is not
            loaded, the input is empty, or the prediction raises.
        """
        predictor = self.predictors.get(property_name)
        if not smiles_list or predictor is None:
            return [None] * len(smiles_list)

        try:
            predictions = predictor.predict(smiles_list)
            return self._safe_predict(predictions)
        except Exception as e:
            # Best effort: one failing model must not abort the whole run.
            print(f"⚠️ Warning: {property_name.upper()} prediction failed: {e}")
            return [None] * len(smiles_list)

    def predict_all_properties(self, smiles_list: List[str]) -> Dict[str, List[Optional[float]]]:
        """Predict every loaded property, featurizing the input only once.

        Args:
            smiles_list: SMILES strings to evaluate.

        Returns:
            A mapping from property key to one value per input SMILES. The
            mapping is empty when the input is empty or the featurizer
            returns None. A property whose model fails, or returns a
            different number of values than inputs, is all ``None``.
        """
        if not smiles_list:
            return {}

        # --- ONE featurization ---
        X_full = featurize_df(smiles_list, return_df=False)
        if X_full is None:
            return {}

        expected = len(smiles_list)
        results = {}

        for prop, predictor in self.predictors.items():
            try:
                X_sel = predictor.selector.transform(X_full)
                preds = self._safe_predict(predictor.model.predict(X_sel))
                # Callers index these lists by input position, so a length
                # mismatch would attach values to the wrong molecule.
                if len(preds) != expected:
                    raise ValueError(
                        f"got {len(preds)} predictions for {expected} SMILES"
                    )
                results[prop] = preds
            except Exception as e:
                print(f"⚠️ {prop} prediction failed: {e}")
                results[prop] = [None] * expected

        return results
|
| 281 |
-
|
| 282 |
-
|
| 283 |
-
|
| 284 |
-
class Population:
    """Manages the population of molecules, unique by SMILES string."""

    def __init__(self, config: EvolutionConfig):
        self.config = config
        self.molecules: List[Molecule] = []
        # SMILES of every molecule added so far, for O(1) duplicate checks.
        self.seen_smiles: set = set()

    def add_molecule(self, mol: Molecule) -> bool:
        """Add a molecule if it's not already in the population.

        Returns:
            True when the molecule was added, False for a duplicate SMILES.
        """
        if mol.smiles in self.seen_smiles:
            return False
        self.molecules.append(mol)
        self.seen_smiles.add(mol.smiles)
        return True

    def add_molecules(self, molecules: List[Molecule]) -> int:
        """Add multiple molecules, return count added."""
        return sum(self.add_molecule(mol) for mol in molecules)

    def pareto_front(self) -> List[Molecule]:
        """Return the molecules not dominated on (cn_error, ysi).

        Both objectives are minimised. The result is empty when YSI
        minimisation is disabled or the population is empty; otherwise it
        keeps the population order.
        """
        if not self.config.minimize_ysi or not self.molecules:
            return []

        cn_errors = np.array([m.cn_error for m in self.molecules])
        ysis = np.array([m.ysi for m in self.molecules])

        front = []
        for idx, mol in enumerate(self.molecules):
            # A point never dominates itself: the strict term is False for it.
            dominated_by = (
                (cn_errors <= cn_errors[idx])
                & (ysis <= ysis[idx])
                & ((cn_errors < cn_errors[idx]) | (ysis < ysis[idx]))
            )
            if not np.any(dominated_by):
                front.append(mol)
        return front

    def get_survivors(self) -> List[Molecule]:
        """Select survivors for the next generation.

        The quota is ``int(population_size * survivor_fraction)``. Without
        YSI minimisation the molecules with the lowest ``cn_error`` survive.
        Otherwise the Pareto front is trimmed, or topped up from the rest of
        the population, ranked by ``cn_error + ysi``.
        """
        target_size = int(self.config.population_size * self.config.survivor_fraction)

        if not self.config.minimize_ysi:
            return sorted(self.molecules, key=lambda m: m.cn_error)[:target_size]

        def combined(mol):
            # Unweighted sum of the two objectives, used only for ranking.
            return mol.cn_error + mol.ysi

        survivors = self.pareto_front()

        if len(survivors) > target_size:
            return sorted(survivors, key=combined)[:target_size]

        if len(survivors) < target_size:
            # SMILES are unique within a population, so a set lookup replaces
            # the field-by-field equality scan of a list membership test.
            chosen = {mol.smiles for mol in survivors}
            remainder = sorted(
                (mol for mol in self.molecules if mol.smiles not in chosen),
                key=combined,
            )
            survivors.extend(remainder[:target_size - len(survivors)])

        return survivors

    def to_dataframe(self) -> pd.DataFrame:
        """Convert population to DataFrame.

        Returns:
            One row per molecule, sorted by ``cn_error`` (then ``ysi`` when
            YSI is minimised) with a 1-based ``rank`` as first column. An
            empty population gives an empty frame with only ``rank``.
        """
        records = [m.to_dict() for m in self.molecules]
        if not records:
            # Sorting a column-less frame would raise KeyError.
            return pd.DataFrame(columns=["rank"])

        frame = pd.DataFrame(records)
        sort_cols = ["cn_error", "ysi"] if self.config.minimize_ysi else ["cn_error"]
        frame = frame.sort_values(sort_cols, ascending=True)
        frame.insert(0, 'rank', range(1, len(frame) + 1))
        return frame
|
| 362 |
-
|
| 363 |
-
|
| 364 |
-
class MolecularEvolution:
    """Main evolutionary algorithm coordinator.

    Seeds a population from the reference dataset, then repeatedly keeps the
    best molecules, mutates them with CReM and evaluates the offspring with
    the property predictors.
    """

    # CReM fragment database; a relative path, resolved against the current
    # working directory.
    REP_DB_PATH = "diesel_fragments.db"

    # Number of reference molecules sampled to seed the population.
    _SEED_SAMPLE_SIZE = 200
    # Upper bounds (exclusive) applied to the reported result tables.
    _MAX_REPORTED_CN_ERROR = 5
    _MAX_REPORTED_YSI = 50
    # Every property a Molecule can carry.
    _PROPERTY_KEYS = ("cn", "ysi", "bp", "density", "lhv", "dynamic_viscosity")

    def __init__(self, config: EvolutionConfig):
        self.config = config
        self.predictor = PropertyPredictor(config)
        self.population = Population(config)

    def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
        """Generate mutations for a molecule using CREM with set-based deduplication.

        Returns:
            SMILES of mutants not yet seen by the current population, or an
            empty list when CReM raises.
        """
        try:
            mutants = set(mutate_mol(
                mol,
                db_name=str(self.REP_DB_PATH),
                max_size=2,
                return_mol=False
            ))
            return list(mutants - self.population.seen_smiles)
        except Exception:
            # Deliberate best effort: a parent that cannot be mutated simply
            # contributes no offspring.
            return []

    @staticmethod
    def _property_array(values, n: int) -> np.ndarray:
        """Return ``values`` as a float array of length ``n`` with NaN for gaps.

        A missing property, or one whose length does not match the SMILES
        list, becomes all-NaN so it can never be attached to the wrong input.
        """
        if values is None or len(values) != n:
            return np.full(n, np.nan)
        return np.array([np.nan if v is None else v for v in values], dtype=float)

    def _build_molecules(self, smiles_list: List[str], predictions: Dict) -> List[Molecule]:
        """Apply the configured constraints and build the passing molecules.

        Args:
            smiles_list: Candidate SMILES.
            predictions: Property key -> one value per SMILES, with ``None``
                or NaN marking a missing value.

        Returns:
            Molecules with a cetane number (and a YSI when it is minimised)
            that satisfy every enabled filter whose property was predicted.
        """
        n = len(smiles_list)
        cfg = self.config
        # Float arrays with NaN for gaps: np.isnan raises TypeError on object
        # arrays containing None, and NaN fails every comparison below.
        cols = {
            key: self._property_array(predictions.get(key), n)
            for key in self._PROPERTY_KEYS
        }

        valid_mask = ~np.isnan(cols["cn"])
        if cfg.minimize_ysi:
            valid_mask &= ~np.isnan(cols["ysi"])

        if 'bp' in predictions and cfg.use_bp_filter:
            valid_mask &= (cols["bp"] >= cfg.min_bp) & (cols["bp"] <= cfg.max_bp)

        if 'density' in predictions and cfg.use_density_filter:
            valid_mask &= cols["density"] > cfg.min_density

        if 'lhv' in predictions and cfg.use_lhv_filter:
            valid_mask &= cols["lhv"] > cfg.min_lhv

        if 'dynamic_viscosity' in predictions and cfg.use_dynamic_viscosity_filter:
            viscosity = cols["dynamic_viscosity"]
            valid_mask &= (
                (viscosity > cfg.min_dynamic_viscosity)
                & (viscosity <= cfg.max_dynamic_viscosity)
            )

        def optional_value(key, idx):
            value = cols[key][idx]
            return None if np.isnan(value) else float(value)

        molecules = []
        for idx in np.flatnonzero(valid_mask):
            cn = float(cols["cn"][idx])
            molecules.append(Molecule(
                smiles=smiles_list[idx],
                cn=cn,
                cn_error=abs(cn - cfg.target_cn),
                bp=optional_value("bp", idx),
                ysi=optional_value("ysi", idx),
                density=optional_value("density", idx),
                lhv=optional_value("lhv", idx),
                dynamic_viscosity=optional_value("dynamic_viscosity", idx),
            ))
        return molecules

    def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
        """Predict all properties for ``smiles_list`` and build the valid molecules."""
        if not smiles_list:
            return []

        predictions = self.predictor.predict_all_properties(smiles_list)
        return self._build_molecules(smiles_list, predictions)

    def _cache_is_usable(self, cached: pd.DataFrame, initial_smiles: List[str]) -> bool:
        """Return True when ``cached`` covers exactly these SMILES and all loaded predictors."""
        if "smiles" not in cached.columns:
            return False
        if cached["smiles"].tolist() != list(initial_smiles):
            return False
        return set(self.predictor.predictors) <= set(cached.columns)

    def initialize_population(self, initial_smiles: List[str]) -> int:
        """Seed the population from ``initial_smiles``.

        Predictions are read from ``INITIAL_PRED_CACHE`` when that file
        matches the requested SMILES and holds a column for every loaded
        predictor; otherwise they are recomputed and the file is rewritten.

        Returns:
            The number of molecules added to the population.
        """
        cache_path = INITIAL_PRED_CACHE
        cache_path.parent.mkdir(parents=True, exist_ok=True)

        df_pred = None
        if cache_path.exists():
            cached = pd.read_parquet(cache_path)
            # A cache written for other SMILES or other enabled predictors
            # would mislabel molecules or miss required columns.
            if self._cache_is_usable(cached, initial_smiles):
                print("✓ Loading cached initial predictions")
                df_pred = cached

        if df_pred is None:
            print("Predicting properties for initial population (cached)...")
            predictions = self.predictor.predict_all_properties(initial_smiles)
            # Only columns aligned with the SMILES can go into the frame.
            aligned = {
                key: values
                for key, values in predictions.items()
                if len(values) == len(initial_smiles)
            }
            df_pred = pd.DataFrame({"smiles": initial_smiles, **aligned})
            df_pred.to_parquet(cache_path)

        # --- Apply constraints & build Molecules ---
        # Same validation path as the offspring, so missing values and
        # disabled filters are treated identically.
        cached_predictions = {
            column: df_pred[column].tolist()
            for column in df_pred.columns
            if column != "smiles"
        }
        molecules = self._build_molecules(df_pred["smiles"].tolist(), cached_predictions)

        return self.population.add_molecules(molecules)

    def _log_generation_stats(self, generation: int):
        """Log statistics for the current generation."""
        mols = self.population.molecules
        if not mols:
            # min() and mean() are undefined for an empty population.
            print(f"Gen {generation}/{self.config.generations} | Pop 0")
            return

        best_cn = min(mols, key=lambda m: m.cn_error)
        avg_cn_err = np.mean([m.cn_error for m in mols])

        print_msg = (f"Gen {generation}/{self.config.generations} | "
                     f"Pop {len(mols)} | "
                     f"Best CN err: {best_cn.cn_error:.3f} | "
                     f"Avg CN err: {avg_cn_err:.3f}")

        if self.config.minimize_ysi:
            front = self.population.pareto_front()
            best_ysi = min(mols, key=lambda m: m.ysi)
            avg_ysi = np.mean([m.ysi for m in mols])

            print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
                          f"Avg YSI: {avg_ysi:.3f} | "
                          f"Pareto size: {len(front)}")

        print(print_msg)

    def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
        """Generate offspring from survivors with parallel mutation.

        Returns:
            Evaluated, constraint-passing offspring; empty when there are no
            survivors, no free population slots, or no mutants.
        """
        target_count = self.config.population_size - len(survivors)
        if not survivors or target_count <= 0:
            return []
        max_attempts = target_count * self.config.max_offspring_attempts

        # Parents are drawn with replacement.
        parents = [random.choice(survivors) for _ in range(max_attempts)]
        parent_mols = [p.get_mol() for p in parents]  # Use cached Mol objects
        parent_mols = [m for m in parent_mols if m is not None]

        print(f"  → Generating mutations in parallel ({len(parent_mols)} parents)...")
        # NOTE(review): the bound method makes joblib ship `self` to the
        # workers -- presumably including the loaded models; verify the cost.
        all_children_nested = Parallel(n_jobs=self.config.n_jobs, batch_size=10)(
            delayed(self._mutate_molecule)(mol) for mol in parent_mols
        )

        # Flatten, drop repeats in first-seen order, then cap: duplicates
        # would otherwise use up the evaluation budget.
        all_children = list(dict.fromkeys(
            child for children in all_children_nested for child in children
        ))
        all_children = all_children[:target_count * 3]  # Reasonable limit

        if not all_children:
            return []

        print(f"  → Evaluating {len(all_children)} offspring...")
        return self._create_molecules(all_children)

    def _run_evolution_loop(self):
        """Run the main evolution loop with progress tracking."""
        for gen in tqdm(range(1, self.config.generations + 1), desc="Evolution"):
            self._log_generation_stats(gen)

            survivors = self.population.get_survivors()
            offspring = self._generate_offspring(survivors)

            # A fresh population resets the seen-SMILES set to this generation.
            new_pop = Population(self.config)
            new_pop.add_molecules(survivors + offspring)
            self.population = new_pop

    def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Generate final results DataFrames.

        Returns:
            ``(final_df, pareto_df)``. When YSI is minimised both are limited
            to rows under the reporting thresholds and re-ranked; otherwise
            ``pareto_df`` is empty.
        """
        final_df = self.population.to_dataframe()

        if self.config.minimize_ysi and "ysi" in final_df.columns:
            final_df = final_df[
                (final_df["cn_error"] < self._MAX_REPORTED_CN_ERROR) &
                (final_df["ysi"] < self._MAX_REPORTED_YSI)
            ].sort_values(["cn_error", "ysi"], ascending=True)

        # overwrite rank safely
        final_df["rank"] = range(1, len(final_df) + 1)

        if self.config.minimize_ysi:
            pareto_mols = self.population.pareto_front()
            pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])

            if not pareto_df.empty:
                pareto_df = pareto_df[
                    (pareto_df['cn_error'] < self._MAX_REPORTED_CN_ERROR)
                    & (pareto_df['ysi'] < self._MAX_REPORTED_YSI)
                ].sort_values(["cn_error", "ysi"], ascending=True)
                pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
        else:
            pareto_df = pd.DataFrame()

        return final_df, pareto_df

    def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Run the evolutionary algorithm.

        Returns:
            ``(final_df, pareto_df)``; two empty frames when no seed molecule
            passes the constraints.
        """
        # `df` is the reference dataset imported from data_prep. Cap the
        # sample so datasets smaller than the seed size do not raise.
        seed_count = min(self._SEED_SAMPLE_SIZE, len(df))
        init_smiles = df["SMILES"].sample(seed_count, random_state=42).tolist()
        init_count = self.initialize_population(init_smiles)

        if init_count == 0:
            print("❌ No valid initial molecules")
            return pd.DataFrame(), pd.DataFrame()

        print(f"✓ Initial population size: {init_count}")

        self._run_evolution_loop()

        return self._generate_results()
|
| 625 |
-
|
| 626 |
-
|
| 627 |
-
def get_user_config() -> EvolutionConfig:
    """Get configuration from user input.

    Prompts until the target cetane number is a number above 40 (an empty
    answer means 50), then asks whether YSI should be minimised.

    Returns:
        An ``EvolutionConfig`` with the chosen target and YSI flag; all other
        fields keep their defaults.
    """
    print("\n" + "="*70)
    print("MOLECULAR EVOLUTION WITH GENETIC ALGORITHM (OPTIMIZED)")
    print("="*70)

    while True:
        answer = input("Enter target CN: ").strip() or "50"
        try:
            target = float(answer)
        except ValueError:
            # Ask again instead of aborting on a typo.
            print(f"⚠️ '{answer}' is not a valid number, please try again.\n")
            continue
        if target > 40:
            break
        print("⚠️  Target CN is too low, optimization may be challenging.")
        print("Consider using a higher target CN for better results.\n")

    minimize_ysi = input("Minimise YSI (y/n): ").strip().lower() in ['y', 'yes']

    return EvolutionConfig(target_cn=target, minimize_ysi=minimize_ysi)
|
| 643 |
-
|
| 644 |
-
|
| 645 |
-
def save_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
    """Write the result tables as CSV files into ``results/``.

    Args:
        final_df: Final population table; always written.
        pareto_df: Pareto-front table; written only when ``minimize_ysi`` is
            set and the table is not empty.
        minimize_ysi: Whether YSI was an optimisation objective.
    """
    out_dir = Path("results")
    out_dir.mkdir(exist_ok=True)

    exports = [(final_df, "final_population.csv")]
    if minimize_ysi and not pareto_df.empty:
        exports.append((pareto_df, "pareto_front.csv"))

    for table, filename in exports:
        table.to_csv(out_dir / filename, index=False)

    print("\n✓ Saved to results/")
|
| 655 |
-
|
| 656 |
-
|
| 657 |
-
def display_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
    """Print the best molecules to the console.

    Shows the first 10 rows of ``final_df`` and, when YSI minimisation was
    requested and the front is non-empty, the first 20 rows of ``pareto_df``.
    An empty ``final_df`` (which ``evolve`` returns when no seed molecule is
    valid) is reported instead of raising ``KeyError`` on the missing columns.
    """
    cols = ["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"]

    if final_df.empty:
        # An empty frame has none of the columns selected below.
        print("\nNo results to display.")
        return

    print("\n=== TOP 10 (sorted) ===")
    print(final_df.head(10)[cols].to_string(index=False))

    if minimize_ysi and not pareto_df.empty:
        print("\n=== PARETO FRONT (ranked) ===")
        print(pareto_df[cols].head(20).to_string(index=False))
|
| 668 |
-
|
| 669 |
-
|
| 670 |
-
def main():
    """Run the interactive command-line workflow with optional profiling.

    Collects the configuration, runs the evolution, then displays and saves
    the results. When the ``PROFILE`` environment variable is set to a
    non-empty value the run is wrapped in ``cProfile`` and the top 20 entries
    by cumulative time are printed, even if the run fails part-way.
    """
    import cProfile
    import pstats

    config = get_user_config()

    # Optional profiling
    profiler = None
    if os.environ.get('PROFILE'):
        profiler = cProfile.Profile()
        profiler.enable()

    try:
        evolution = MolecularEvolution(config)
        final_df, pareto_df = evolution.evolve()

        # Display and save results
        display_results(final_df, pareto_df, config.minimize_ysi)
        save_results(final_df, pareto_df, config.minimize_ysi)
    finally:
        # Always stop the profiler so a failing run still reports where the
        # time went and the profiler is not left enabled.
        if profiler:
            profiler.disable()
            stats = pstats.Stats(profiler)
            stats.sort_stats('cumulative')
            print("\n=== PROFILING STATS (Top 20) ===")
            stats.print_stats(20)


if __name__ == "__main__":
    main()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/model_config.py
DELETED
|
@@ -1,53 +0,0 @@
|
|
| 1 |
-
"""
|
| 2 |
-
Model configuration for loading from Hugging Face Hub.
|
| 3 |
-
|
| 4 |
-
Instructions:
|
| 5 |
-
1. Upload your models to Hugging Face
|
| 6 |
-
2. Update the repo IDs below with your actual repository names
|
| 7 |
-
3. Set USE_LOCAL_MODELS=false in your environment (default)
|
| 8 |
-
"""
|
| 9 |
-
|
| 10 |
-
import os
|
| 11 |
-
|
| 12 |
-
# Model source switch: local files when the USE_LOCAL_MODELS environment
# variable equals "true" (case-insensitive), Hugging Face Hub otherwise
# (the default).
USE_LOCAL_MODELS = os.environ.get("USE_LOCAL_MODELS", "false").lower() == "true"

# ============================================================================
# HUGGING FACE MODEL REPOSITORIES
# ============================================================================
# One repository ID per predicted property, in the form
# "username/repo-name" or "organization/repo-name".

HF_MODEL_REPOS = {
    "cn": "SalZa2004/Cetane_Number_Predictor",
    "ysi": "SalZa2004/YSI_Predictor",
    "bp": "SalZa2004/BP_Predictor",
    "density": "SalZa2004/Density_Predictor",
    "lhv": "SalZa2004/LHV_Predictor",
}
|
| 29 |
-
|
| 30 |
-
# ============================================================================
|
| 31 |
-
# VALIDATION
|
| 32 |
-
# ============================================================================
|
| 33 |
-
|
| 34 |
-
def validate_config():
    """Check that every Hugging Face repository ID is configured.

    Nothing is checked when ``USE_LOCAL_MODELS`` is true. Otherwise a repo ID
    counts as unconfigured when it is not a string, is blank, or still equals
    the ``SalZa2004/<prop>-predictor`` placeholder. A warning is printed for
    every unconfigured property, not just the first one found.

    Returns:
        True when the configuration is usable, False otherwise.
    """
    if USE_LOCAL_MODELS:
        return True

    unconfigured = [
        prop
        for prop, repo in HF_MODEL_REPOS.items()
        if not isinstance(repo, str)
        or not repo.strip()
        or repo == f"SalZa2004/{prop}-predictor"
    ]

    for prop in unconfigured:
        print(f"⚠️ Warning: {prop} model repo not configured!")
        print(f" Update HF_MODEL_REPOS['{prop}'] in model_config.py")

    return not unconfigured
|
| 44 |
-
|
| 45 |
-
# Run validation on import: when this module is imported rather than executed
# as a script, print a banner if Hub loading is selected but validation fails.
if __name__ != "__main__" and not validate_config() and not USE_LOCAL_MODELS:
    print(
        "\n".join(
            [
                "\n" + "="*70,
                "❌ MODEL CONFIGURATION INCOMPLETE",
                "="*70,
                "\nPlease update model_config.py with your Hugging Face repository IDs.",
                "Example: HF_MODEL_REPOS['cn'] = 'john-doe/cetane-predictor'",
                "="*70 + "\n",
            ]
        )
    )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/shared_features.py
DELETED
|
@@ -1,233 +0,0 @@
|
|
| 1 |
-
import os
|
| 2 |
-
import sqlite3
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import numpy as np
|
| 5 |
-
from sklearn.model_selection import train_test_split
|
| 6 |
-
|
| 7 |
-
# Project root is the parent of the directory that contains this module.
PROJECT_ROOT = os.path.split(os.path.split(__file__)[0])[0]
# SQLite database location: <PROJECT_ROOT>/data/database/database_main.db
DB_PATH = os.path.join(PROJECT_ROOT, *("data", "database", "database_main.db"))
|
| 9 |
-
from functools import lru_cache
|
| 10 |
-
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
def load_raw_data():
    """Load fuel names, SMILES strings and cetane numbers from SQLite.

    Reads ``FUEL`` left-joined with ``TARGET`` from the database at
    ``DB_PATH`` and drops every row lacking either a ``cn`` value or a
    ``SMILES`` string.

    Returns:
        A DataFrame with columns ``Fuel_Name``, ``SMILES`` and ``cn``.
    """
    from contextlib import closing

    print("Connecting to SQLite database...")

    query = """
        SELECT
            F.Fuel_Name,
            F.SMILES,
            T.Standardised_DCN AS cn
        FROM FUEL F
        LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
    """

    # closing() guarantees the connection is released even if the query
    # raises; previously a failing query leaked the connection.
    with closing(sqlite3.connect(DB_PATH)) as conn:
        df = pd.read_sql_query(query, conn)

    # Clean data
    return df.dropna(subset=["cn", "SMILES"])
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
# ============================================================================
|
| 37 |
-
# 2. FEATURIZATION MODULE
|
| 38 |
-
# ============================================================================
|
| 39 |
-
from rdkit import Chem
|
| 40 |
-
from rdkit.Chem import Descriptors, rdFingerprintGenerator
|
| 41 |
-
from tqdm import tqdm
|
| 42 |
-
|
| 43 |
-
# Descriptor names and their calculator callables, taken from RDKit's
# descriptor list. Both lists share the same order, so position i in one
# corresponds to position i in the other.
DESCRIPTOR_NAMES = [entry[0] for entry in Descriptors._descList]
desc_functions = [entry[1] for entry in Descriptors._descList]
|
| 46 |
-
|
| 47 |
-
def morgan_fp_from_mol(mol, radius=2, n_bits=2048):
    """Return the Morgan fingerprint of ``mol`` as a 1-D integer array.

    The array holds one 0/1 entry per character of the fingerprint's bit
    string, generated with the given ``radius`` and ``fpSize=n_bits``.
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    bit_string = generator.GetFingerprint(mol).ToBitString()
    return np.array(list(bit_string), dtype=int)
|
| 53 |
-
|
| 54 |
-
def physchem_desc_from_mol(mol):
    """Calculate the physicochemical descriptor vector for ``mol``.

    Calls every function in ``desc_functions`` on the molecule and replaces
    NaN and infinite values with 0.0.

    Returns:
        A float32 array with one entry per descriptor function, or None when
        any descriptor calculation fails.
    """
    try:
        desc = np.array([fn(mol) for fn in desc_functions], dtype=np.float32)
    except Exception:
        # Best effort: a failing descriptor marks the molecule as unusable.
        # Catch Exception rather than everything so Ctrl-C and SystemExit
        # still propagate.
        return None
    return np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
|
| 62 |
-
|
| 63 |
-
def featurize(smiles):
    """Turn one SMILES string into a single feature vector.

    Returns the Morgan fingerprint followed by the physicochemical
    descriptors, or None when the SMILES cannot be parsed or either part
    could not be computed.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fingerprint = morgan_fp_from_mol(mol)
    descriptors = physchem_desc_from_mol(mol)

    if fingerprint is not None and descriptors is not None:
        return np.hstack([fingerprint, descriptors])
    return None
|
| 76 |
-
|
| 77 |
-
def featurize_df(df, smiles_col="SMILES", return_df=True):
    """Featurize a DataFrame, Series, array or list of SMILES strings.

    Each parsable molecule becomes one row holding its Morgan fingerprint
    followed by its physicochemical descriptors. Molecules are processed one
    at a time in a loop; entries that cannot be parsed or featurized are
    skipped.

    Args:
        df: DataFrame containing ``smiles_col``, or a list / ndarray / Series
            of SMILES strings.
        smiles_col: Name of the SMILES column.
        return_df: When true, also return the rows that were featurized.

    Returns:
        ``(X, df_valid)`` when ``return_df`` is true, otherwise ``X``. When no
        molecule could be featurized the result is ``(None, None)`` or
        ``None`` respectively.
    """
    # Handle different input types
    if isinstance(df, (list, np.ndarray, pd.Series)):
        df = pd.DataFrame({smiles_col: df})

    # Convert all SMILES to molecules in batch
    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_col]]

    features = []
    valid_indices = []

    # Process valid molecules
    for i, mol in enumerate(tqdm(mols, desc="Featurizing")):
        if mol is None:
            continue

        try:
            fp = morgan_fp_from_mol(mol)
            desc = physchem_desc_from_mol(mol)

            if fp is not None and desc is not None:
                features.append(np.hstack([fp, desc]))
                valid_indices.append(i)
        except Exception:
            # Best effort: skip molecules that cannot be featurized, but let
            # KeyboardInterrupt / SystemExit through so long runs can be
            # cancelled.
            continue

    if not features:
        return (None, None) if return_df else None

    X = np.vstack(features)

    if return_df:
        df_valid = df.iloc[valid_indices].reset_index(drop=True)
        return X, df_valid
    return X
|
| 118 |
-
@lru_cache(maxsize=50_000)
def cached_featurize_smiles(smiles: str):
    """Return the feature vector for one SMILES string, memoised per string.

    Delegates to ``featurize_df`` with a one-element list and returns its
    first row, or None when featurization yields nothing.
    """
    batch = featurize_df([smiles], return_df=False)
    return None if batch is None else batch[0]  # single feature vector
|
| 125 |
-
|
| 126 |
-
# ============================================================================
|
| 127 |
-
# 3. FEATURE SELECTOR CLASS
|
| 128 |
-
# ============================================================================
|
| 129 |
-
import joblib
|
| 130 |
-
|
| 131 |
-
class FeatureSelector:
    """Feature selection pipeline that can be saved and reused.

    Input matrices hold ``n_morgan`` fingerprint columns followed by
    descriptor columns. Fitting first drops each descriptor column whose
    absolute correlation with an earlier descriptor column exceeds
    ``corr_threshold``, then keeps the ``top_k`` remaining columns ranked by
    ExtraTrees feature importance.
    """

    def __init__(self, n_morgan=2048, corr_threshold=0.95, top_k=300):
        self.n_morgan = n_morgan
        self.corr_threshold = corr_threshold
        self.top_k = top_k

        # Filled during fit()
        self.corr_cols_to_drop = None
        self.selected_indices = None
        self.n_features_in_ = None
        self.is_fitted = False

    def fit(self, X, y):
        """Fit the feature selector on training data and return ``self``."""
        print("\n" + "="*70)
        print("FITTING FEATURE SELECTOR")
        print("="*70)

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        print(f"Morgan fingerprints: {X_mfp.shape[1]}")
        print(f"Descriptors: {X_desc.shape[1]}")

        # Step 2: Remove correlated descriptors.  Only the strict upper
        # triangle is inspected, so a column is compared with the columns
        # before it and the first member of a correlated group is kept.
        desc_df = pd.DataFrame(X_desc)
        corr_matrix = desc_df.corr().abs()
        upper = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )

        self.corr_cols_to_drop = [
            col for col in upper.columns
            if (upper[col] > self.corr_threshold).any()
        ]

        print(f"Correlated descriptors removed: {len(self.corr_cols_to_drop)}")

        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        print(f"Features after correlation filter: {X_corr.shape[1]}")

        # Step 3: Feature importance selection
        from sklearn.ensemble import ExtraTreesRegressor

        print("Running feature importance selection...")
        model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_corr, y)

        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]

        self.selected_indices = indices[:self.top_k]

        print(f"Final selected features: {len(self.selected_indices)}")

        # Remember the training width so transform() can reject matrices
        # that were featurized differently.
        self.n_features_in_ = X.shape[1]
        self.is_fitted = True
        return self

    def transform(self, X):
        """Apply the fitted feature selection to new data.

        Raises:
            RuntimeError: If the selector has not been fitted.
            ValueError: If ``X`` has a different number of columns than the
                matrix used in ``fit`` (only checked when that width is known).
        """
        if not self.is_fitted:
            raise RuntimeError("FeatureSelector must be fitted before transform!")

        # Selectors pickled before this attribute existed do not have it;
        # skip the check for them.
        expected_width = getattr(self, "n_features_in_", None)
        if expected_width is not None and X.shape[1] != expected_width:
            raise ValueError(
                f"X has {X.shape[1]} features, but FeatureSelector was "
                f"fitted with {expected_width}"
            )

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        # Step 2: Remove same correlated descriptors
        desc_df = pd.DataFrame(X_desc)
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        # Step 3: Select same important features
        return X_corr[:, self.selected_indices]

    def fit_transform(self, X, y):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def save(self, filepath='feature_selector.joblib'):
        """Save the fitted selector to ``filepath`` with joblib.

        Raises:
            RuntimeError: If the selector has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted selector!")

        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        joblib.dump(self, filepath)
        print(f"✓ Feature selector saved to {filepath}")

    @staticmethod
    def load(filepath='feature_selector.joblib'):
        """Load a fitted selector from ``filepath``.

        Raises:
            RuntimeError: If the loaded object is not a fitted selector.
        """
        # SECURITY: joblib.load unpickles the file and can execute arbitrary
        # code; only load selector files from trusted sources.
        selector = joblib.load(filepath)
        # getattr keeps a foreign object from failing with AttributeError.
        if not getattr(selector, "is_fitted", False):
            raise RuntimeError("Loaded selector is not fitted!")
        print(f"✓ Feature selector loaded from {filepath}")
        return selector
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/streamlit_app.py
DELETED
|
@@ -1,161 +0,0 @@
|
|
| 1 |
-
import streamlit as st
|
| 2 |
-
from pathlib import Path
|
| 3 |
-
import pandas as pd
|
| 4 |
-
import os
|
| 5 |
-
from shared_features import FeatureSelector, featurize_df
|
| 6 |
-
# -----------------------------
|
| 7 |
-
# OPTIONAL: Disable wandb on HF
|
| 8 |
-
# -----------------------------
|
| 9 |
-
os.environ["WANDB_MODE"] = "disabled"
|
| 10 |
-
|
| 11 |
-
# -----------------------------
|
| 12 |
-
# Import your existing code
|
| 13 |
-
# -----------------------------
|
| 14 |
-
from dataclasses import asdict
|
| 15 |
-
from main import (
|
| 16 |
-
EvolutionConfig,
|
| 17 |
-
MolecularEvolution
|
| 18 |
-
)
|
| 19 |
-
|
| 20 |
-
# -----------------------------
# Page config
# -----------------------------
st.set_page_config(
    page_title="Molecular Evolution Optimizer",
    layout="wide"
)

st.title("🧬 Molecular Evolution for Cetane Optimization")
st.markdown(
    """
This app runs a **genetic algorithm** to evolve molecules towards a
**target Cetane Number (CN)**, optionally minimizing **YSI** and
enforcing physical constraints.
"""
)

# -----------------------------
# Sidebar: Configuration
# -----------------------------
st.sidebar.header("⚙️ Evolution Configuration")

target_cn = st.sidebar.slider(
    "Target Cetane Number (CN)",
    min_value=40.0,
    max_value=80.0,
    value=50.0,
    step=1.0
)

minimize_ysi = st.sidebar.checkbox(
    "Minimize YSI",
    value=True
)

generations = st.sidebar.slider(
    "Generations",
    min_value=1,
    max_value=20,
    value=6
)

population_size = st.sidebar.slider(
    "Population Size",
    min_value=10,
    max_value=200,
    value=50,
    step=10
)

survivor_fraction = st.sidebar.slider(
    "Survivor Fraction",
    min_value=0.1,
    max_value=0.9,
    value=0.5,
    step=0.05
)

st.sidebar.subheader("🔬 Property Filters")

use_bp_filter = st.sidebar.checkbox("Use Boiling Point filter", True)
use_density_filter = st.sidebar.checkbox("Use Density filter", True)
use_lhv_filter = st.sidebar.checkbox("Use LHV filter", True)
use_dv_filter = st.sidebar.checkbox("Use Dynamic Viscosity filter", True)

# -----------------------------
# Build config
# -----------------------------
config = EvolutionConfig(
    target_cn=target_cn,
    minimize_ysi=minimize_ysi,
    generations=generations,
    population_size=population_size,
    survivor_fraction=survivor_fraction,
    use_bp_filter=use_bp_filter,
    use_density_filter=use_density_filter,
    use_lhv_filter=use_lhv_filter,
    use_dynamic_viscosity_filter=use_dv_filter,
)

# -----------------------------
# Run button
# -----------------------------
run = st.button("🚀 Run Evolution")

if run:
    with st.spinner("Running molecular evolution... This may take several minutes."):
        evolution = MolecularEvolution(config)
        final_df, pareto_df = evolution.evolve()

    st.success("Evolution completed!")

    # st.button is only True on the rerun triggered by its own click, and
    # every other widget interaction (including the download buttons below)
    # reruns the script.  Keep the results in session state so they do not
    # vanish as soon as the user downloads a file.
    st.session_state["evolution_results"] = (final_df, pareto_df, minimize_ysi)

results = st.session_state.get("evolution_results")

if results is not None:
    # Use the YSI setting from the run that produced these results, not the
    # current checkbox value.
    final_df, pareto_df, ran_with_ysi = results

    # -----------------------------
    # Results: Final population
    # -----------------------------
    st.header("📊 Final Population")

    if final_df.empty:
        st.warning("No valid molecules found.")
    else:
        st.dataframe(final_df, use_container_width=True)

        csv = final_df.to_csv(index=False).encode("utf-8")
        st.download_button(
            "⬇️ Download Final Population CSV",
            csv,
            "final_population.csv",
            "text/csv"
        )

    # -----------------------------
    # Results: Pareto front
    # -----------------------------
    if ran_with_ysi:
        st.header("🏆 Pareto Front")

        if pareto_df.empty:
            st.info("No Pareto-optimal molecules found.")
        else:
            st.dataframe(pareto_df, use_container_width=True)

            pareto_csv = pareto_df.to_csv(index=False).encode("utf-8")
            st.download_button(
                "⬇️ Download Pareto Front CSV",
                pareto_csv,
                "pareto_front.csv",
                "text/csv"
            )

    # -----------------------------
    # Quick plots
    # -----------------------------
    if not final_df.empty:
        st.header("📈 CN Error vs YSI")

        if "ysi" in final_df.columns:
            st.scatter_chart(
                final_df,
                x="cn_error",
                y="ysi"
            )
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|