Spaces:
Build error
Build error
updated applications
Browse files- applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc +0 -0
- applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc +0 -0
- applications/3_molecule_generator/__pycache__/main.cpython-310.pyc +0 -0
- applications/3_molecule_generator/__pycache__/results.cpython-310.pyc +0 -0
- applications/__init__.py +0 -0
- applications/docker/.dockerignore +0 -5
- applications/docker/Dockerfile +0 -33
- applications/docker/docker-compose.yml +0 -22
- applications/mixture_aware_generator/__init__.py +0 -0
- applications/mixture_predictor/__init__.py +0 -0
- applications/molecule_generator/__init__.py +0 -0
- applications/molecule_generator/cli.py +43 -0
- applications/molecule_generator/main.py +34 -0
- applications/molecule_generator/results.py +37 -0
- applications/molecule_generator/results/final_population.csv +6 -0
- applications/molecule_generator/results/pareto_front.csv +3 -0
- applications/pure_predictor/__init__.py +0 -0
- applications/pure_predictor/cli.py +29 -0
- applications/pure_predictor/main.py +82 -0
- applications/pure_predictor/results.py +11 -0
- core/__init__.py +0 -0
- core/__pycache__/config.cpython-310.pyc +0 -0
- core/blending/__init__.py +0 -0
- core/config.py +26 -0
- core/data_prep.py +34 -0
- core/evolution/__init__.py +0 -0
- core/evolution/__pycache__/evolution.cpython-310.pyc +0 -0
- core/evolution/evolution.py +234 -0
- core/evolution/molecule.py +33 -0
- core/evolution/population.py +86 -0
- core/predictors/__init__.py +0 -0
- core/predictors/mixture/__init__.py +0 -0
- core/predictors/pure_component/__pycache__/generic.cpython-310.pyc +0 -0
- core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc +0 -0
- core/predictors/pure_component/generic.py +51 -0
- core/predictors/pure_component/hf_models.py +17 -0
- core/predictors/pure_component/property_predictor.py +77 -0
- core/shared_features.py +223 -0
applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc
DELETED
|
Binary file (143 Bytes)
|
|
|
applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc
DELETED
|
Binary file (1.76 kB)
|
|
|
applications/3_molecule_generator/__pycache__/main.cpython-310.pyc
DELETED
|
Binary file (918 Bytes)
|
|
|
applications/3_molecule_generator/__pycache__/results.cpython-310.pyc
DELETED
|
Binary file (1.85 kB)
|
|
|
applications/__init__.py
ADDED
|
File without changes
|
applications/docker/.dockerignore
DELETED
|
@@ -1,5 +0,0 @@
|
|
| 1 |
-
venv*
|
| 2 |
-
__pycache__/
|
| 3 |
-
*.pyc
|
| 4 |
-
.git/
|
| 5 |
-
.gitignore
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
applications/docker/Dockerfile
DELETED
|
@@ -1,33 +0,0 @@
|
|
| 1 |
-
FROM python:3.10-slim
|
| 2 |
-
|
| 3 |
-
# Avoid interactive prompts
|
| 4 |
-
ENV DEBIAN_FRONTEND=noninteractive
|
| 5 |
-
|
| 6 |
-
# System deps (important for RDKit / ML)
|
| 7 |
-
RUN apt-get update && apt-get install -y \
|
| 8 |
-
git \
|
| 9 |
-
git-lfs \
|
| 10 |
-
build-essential \
|
| 11 |
-
sqlite3 \
|
| 12 |
-
&& rm -rf /var/lib/apt/lists/*
|
| 13 |
-
|
| 14 |
-
# Install git-lfs
|
| 15 |
-
RUN git lfs install
|
| 16 |
-
|
| 17 |
-
# Set working directory
|
| 18 |
-
WORKDIR /app
|
| 19 |
-
|
| 20 |
-
# Copy dependency files first (better caching)
|
| 21 |
-
COPY requirements.txt .
|
| 22 |
-
|
| 23 |
-
RUN pip install --upgrade pip setuptools wheel \
|
| 24 |
-
&& pip install -r requirements.txt
|
| 25 |
-
|
| 26 |
-
# Copy the rest of the project
|
| 27 |
-
COPY . .
|
| 28 |
-
|
| 29 |
-
# Editable install
|
| 30 |
-
RUN pip install -e .
|
| 31 |
-
|
| 32 |
-
# Default command (can override)
|
| 33 |
-
CMD ["bash"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
applications/docker/docker-compose.yml
DELETED
|
@@ -1,22 +0,0 @@
|
|
| 1 |
-
services:
|
| 2 |
-
biofuel-ml:
|
| 3 |
-
build:
|
| 4 |
-
context: ..
|
| 5 |
-
dockerfile: docker/Dockerfile
|
| 6 |
-
image: biofuel-ml:latest
|
| 7 |
-
container_name: biofuel-ml
|
| 8 |
-
tty: true
|
| 9 |
-
stdin_open: true
|
| 10 |
-
|
| 11 |
-
volumes:
|
| 12 |
-
- ..:/app
|
| 13 |
-
- ~/.cache/huggingface:/root/.cache/huggingface
|
| 14 |
-
|
| 15 |
-
working_dir: /app
|
| 16 |
-
|
| 17 |
-
environment:
|
| 18 |
-
- PYTHONUNBUFFERED=1
|
| 19 |
-
- HF_HOME=/root/.cache/huggingface
|
| 20 |
-
- PYTHONHASHSEED=42
|
| 21 |
-
|
| 22 |
-
command: bash
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
applications/mixture_aware_generator/__init__.py
ADDED
|
File without changes
|
applications/mixture_predictor/__init__.py
ADDED
|
File without changes
|
applications/molecule_generator/__init__.py
ADDED
|
File without changes
|
applications/molecule_generator/cli.py
ADDED
|
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from core.config import EvolutionConfig
|
| 2 |
+
|
| 3 |
+
def get_user_config() -> EvolutionConfig:
    """Interactively collect the evolution settings from standard input.

    Prompts for the optimisation mode (target a CN value or maximise CN),
    for the target CN when one is needed, and for whether YSI should be
    minimised, then prints a summary of the choices.

    Returns:
        An EvolutionConfig built from ``target_cn``, ``maximize_cn`` and
        ``minimize_ysi``; every other field keeps its default.
    """
    import math  # local import: only needed to validate the numeric prompt

    print("\n" + "="*70)
    print("MOLECULAR EVOLUTION WITH GENETIC ALGORITHM")
    print("="*70)

    # Choose optimization mode
    print("\nOptimization Mode:")
    print("1. Target a specific CN value (minimize error from target)")
    print("2. Maximize CN (find highest possible CN)")
    mode = input("Select mode (1 or 2): ").strip()
    while mode not in ("1", "2"):
        print("Invalid selection. Please choose 1 or 2.")
        mode = input("Select mode (1 or 2): ").strip()
    maximize_cn = mode == "2"

    if maximize_cn:
        print("\n✓ Mode: Maximize Cetane Number")
        target = 100.0  # Dummy target, not used in maximize mode
    else:
        print("\n✓ Mode: Target Cetane Number")
        while True:
            raw = input("Enter target CN: ").strip()
            try:
                # An empty answer falls back to the default target of 50.
                target = float(raw or "50")
            except ValueError:
                # Re-prompt instead of crashing on non-numeric input.
                print("Invalid number. Please enter a numeric target CN.\n")
                continue
            if not math.isfinite(target):
                # "nan"/"inf" parse as floats but are meaningless targets.
                print("Invalid number. Please enter a numeric target CN.\n")
                continue
            if target > 40:
                break
            print("⚠️ Target CN is too low, optimization may be challenging.")
            print("Consider using a higher target CN for better results.\n")

    # Ask about YSI
    minimize_ysi = input("\nMinimize YSI (y/n): ").strip().lower() in ['y', 'yes']

    # Print configuration summary
    print("\n" + "="*70)
    print("CONFIGURATION SUMMARY:")
    print(f" • Mode: {'Maximize CN' if maximize_cn else f'Target CN = {target}'}")
    print(f" • Minimize YSI: {'Yes' if minimize_ysi else 'No'}")
    print(f" • Optimization: {'Multi-objective (CN + YSI)' if minimize_ysi else 'Single-objective (CN only)'}")
    print("="*70 + "\n")

    return EvolutionConfig(target_cn=target, maximize_cn=maximize_cn, minimize_ysi=minimize_ysi)
|
applications/molecule_generator/main.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
| 3 |
+
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
| 4 |
+
SEED = 42
|
| 5 |
+
|
| 6 |
+
import random, numpy as np
|
| 7 |
+
|
| 8 |
+
from .cli import get_user_config
|
| 9 |
+
from .results import display_results, save_results
|
| 10 |
+
from core.evolution.evolution import MolecularEvolution
|
| 11 |
+
from core.shared_features import FeatureSelector
|
| 12 |
+
|
| 13 |
+
os.environ["PYTHONHASHSEED"] = str(SEED)
|
| 14 |
+
random.seed(SEED)
|
| 15 |
+
np.random.seed(SEED)
|
| 16 |
+
|
| 17 |
+
def run(config):
    """Build a MolecularEvolution for *config* and return what its ``evolve()`` call returns."""
    return MolecularEvolution(config).evolve()
|
| 20 |
+
|
| 21 |
+
def main():
    """Prompt for a configuration, run the evolution, then print and save the results."""
    config = get_user_config()

    # Same construct-then-evolve sequence as run(), reused instead of repeated.
    final_df, pareto_df = run(config)

    # Display and save results
    display_results(final_df, pareto_df, config)
    save_results(final_df, pareto_df, config.minimize_ysi)
|
| 32 |
+
|
| 33 |
+
if __name__ == "__main__":
|
| 34 |
+
main()
|
applications/molecule_generator/results.py
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
from pathlib import Path
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from core.config import EvolutionConfig
|
| 5 |
+
|
| 6 |
+
def save_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
    """Write the result tables as CSV files into a ``results`` directory.

    The directory is resolved relative to the current working directory and
    is created if missing. ``final_population.csv`` is always written;
    ``pareto_front.csv`` only when *minimize_ysi* is true and *pareto_df*
    is not empty.
    """
    out_dir = Path("results")
    out_dir.mkdir(exist_ok=True)

    population_path = out_dir / "final_population.csv"
    final_df.to_csv(population_path, index=False)

    write_pareto = minimize_ysi and not pareto_df.empty
    if write_pareto:
        pareto_path = out_dir / "pareto_front.csv"
        pareto_df.to_csv(pareto_path, index=False)

    print("\n✓ Results saved to results/")
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def display_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, config: EvolutionConfig):
    """Print the top candidates and, when applicable, the Pareto front.

    Only columns that exist in the respective frame are shown, and
    ``cn_error`` is left out when ``config.maximize_cn`` is set. The Pareto
    section is printed only when ``config.minimize_ysi`` is true and
    *pareto_df* is not empty.
    """
    banner = "="*70
    wanted = ["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"]
    if config.maximize_cn:
        wanted.remove("cn_error")

    shown = [name for name in wanted if name in final_df.columns]

    print("\n" + banner)
    print("=== BEST CANDIDATES ===")
    print(banner)
    # At most the first 10 rows of the final population.
    print(final_df.head(10)[shown].to_string(index=False))

    if not config.minimize_ysi or pareto_df.empty:
        return

    print("\n" + banner)
    print("=== PARETO FRONT (Non-dominated solutions) ===")
    print(banner)
    pareto_shown = [name for name in wanted if name in pareto_df.columns]
    # At most the first 20 rows of the Pareto front.
    print(pareto_df[pareto_shown].head(20).to_string(index=False))
|
applications/molecule_generator/results/final_population.csv
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rank,smiles,cn,cn_error,cn_score,ysi
|
| 2 |
+
1,CCCCCC(C)C(=O)O,55.49838163060194,0.49838163060194063,55.49838163060194,45.2672860656464
|
| 3 |
+
2,CCCCC=O,55.79958905600905,0.7995890560090473,55.79958905600905,23.523889793928337
|
| 4 |
+
3,CCCCC(CC)C(=O)O,57.088458252834485,2.088458252834485,57.088458252834485,44.53587682413441
|
| 5 |
+
4,CCCCC(C)C(=O)O,57.32411875680274,2.32411875680274,57.32411875680274,36.7278812007473
|
| 6 |
+
5,CCCCC(=O)O,57.959166968253996,2.959166968253996,57.959166968253996,24.625850459125378
|
applications/molecule_generator/results/pareto_front.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
rank,smiles,cn,cn_error,cn_score,ysi
|
| 2 |
+
1,CCCCCC(C)C(=O)O,55.49838163060194,0.49838163060194063,55.49838163060194,45.2672860656464
|
| 3 |
+
2,CCCCC=O,55.79958905600905,0.7995890560090473,55.79958905600905,23.523889793928337
|
applications/pure_predictor/__init__.py
ADDED
|
File without changes
|
applications/pure_predictor/cli.py
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# applications/pure_predictor/cli.py
|
| 2 |
+
|
| 3 |
+
from rdkit import Chem
|
| 4 |
+
from rdkit.Chem import rdinchi
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def get_user_config():
    """
    Collect user inputs for pure-component property prediction.
    SMILES-only input.

    Returns:
        dict with two keys: "mode" ("1" for single, "2" for batch) and
        "smiles" (the SMILES string in single mode, or the entered file
        path, unchecked, in batch mode).

    Raises:
        ValueError: in single mode, if the SMILES is empty or RDKit cannot
            parse it.
    """
    mode_prompt = "Select prediction mode (1: Single, 2: Batch): "

    mode = input(mode_prompt).strip()
    while mode not in {"1", "2"}:
        print("Invalid selection. Please choose 1 or 2.")
        mode = input(mode_prompt).strip()

    if mode == "1":
        smiles = input("Enter SMILES string: ").strip()
        # RDKit parses "" into an empty molecule rather than returning None,
        # so an empty answer has to be rejected explicitly.
        if not smiles or Chem.MolFromSmiles(smiles) is None:
            raise ValueError("Invalid SMILES string.")
    else:
        smiles = input("Enter path to SMILES file: ").strip()

    return {
        "mode": mode,
        "smiles": smiles
    }
|
| 29 |
+
|
applications/pure_predictor/main.py
ADDED
|
@@ -0,0 +1,82 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# applications/pure_predictor/main.py
|
| 2 |
+
import os
|
| 3 |
+
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
|
| 4 |
+
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"
|
| 5 |
+
|
| 6 |
+
from core.shared_features import featurize_df, FeatureSelector
|
| 7 |
+
from core.predictors.pure_component.generic import GenericPredictor
|
| 8 |
+
from core.predictors.pure_component.hf_models import load_models
|
| 9 |
+
|
| 10 |
+
from .cli import get_user_config
|
| 11 |
+
from .results import display_results
|
| 12 |
+
|
| 13 |
+
# Load model paths (local or HF)
|
| 14 |
+
PREDICTOR_PATHS = load_models()
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def run(config):
    """
    Run pure-component property prediction.

    Featurizes ``config["smiles"]`` once, builds one GenericPredictor per
    property from PREDICTOR_PATHS, and returns a dict holding the input
    under "SMILES" plus the first prediction of each model.

    NOTE(review): ``config["smiles"]`` is featurized as a single SMILES
    string regardless of ``config["mode"]`` — confirm how batch mode is
    meant to be handled.

    Raises:
        RuntimeError: if featurize_df returns None.
    """
    smiles = config["smiles"]

    # --- Featurize ONCE ---
    X_full = featurize_df([smiles], return_df=False)
    if X_full is None:
        raise RuntimeError("Featurization failed for input SMILES.")

    # --- Initialise predictors ---
    # (PREDICTOR_PATHS key, display name), in construction order.
    model_specs = (
        ("cn", "Cetane Number"),
        ("bp", "Boiling Point"),
        ("density", "Density"),
        ("lhv", "Lower Heating Value"),
        ("dynamic_viscosity", "Dynamic Viscosity"),
        ("ysi", "YSI"),
    )
    predictors = {
        key: GenericPredictor(PREDICTOR_PATHS[key], label)
        for key, label in model_specs
    }

    # --- Predict ---
    # (output column, PREDICTOR_PATHS key), in output order.
    output_layout = (
        ("CN", "cn"),
        ("YSI", "ysi"),
        ("BOILING POINT", "bp"),
        ("DENSITY", "density"),
        ("LHV", "lhv"),
        ("DYNAMIC VISCOSITY", "dynamic_viscosity"),
    )
    result = {"SMILES": smiles}
    for column, key in output_layout:
        result[column] = predictors[key].predict_from_features(X_full)[0]

    return result
|
| 73 |
+
|
| 74 |
+
|
| 75 |
+
def main():
    """Collect the user's input, run the prediction and print the result."""
    display_results(run(get_user_config()))
|
| 79 |
+
|
| 80 |
+
|
| 81 |
+
if __name__ == "__main__":
|
| 82 |
+
main()
|
applications/pure_predictor/results.py
ADDED
|
@@ -0,0 +1,11 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# applications/pure_predictor/results.py
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
def display_results(result: dict):
    """
    Print one prediction record as a single-row table under a header line.
    """
    frame = pd.DataFrame([result])
    header = "\n=== PURE COMPONENT PROPERTY PREDICTION ===\n"
    print(header)
    print(frame.to_string(index=False))
|
core/__init__.py
ADDED
|
File without changes
|
core/__pycache__/config.cpython-310.pyc
CHANGED
|
Binary files a/core/__pycache__/config.cpython-310.pyc and b/core/__pycache__/config.cpython-310.pyc differ
|
|
|
core/blending/__init__.py
ADDED
|
File without changes
|
core/config.py
ADDED
|
@@ -0,0 +1,26 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, field
|
| 2 |
+
|
| 3 |
+
def _default_filters() -> dict:
    """Return a fresh copy of the default (lower, upper) property bounds."""
    return {
        "bp": (60.0, 250.0),
        "density": (720.0, None),
        "lhv": (30.0, None),
        "dynamic_viscosity": (0.0, 2.0),
    }


@dataclass
class EvolutionConfig:
    """Settings for one molecular-evolution run."""

    # Objective selection
    target_cn: float = 50.0
    maximize_cn: bool = False
    minimize_ysi: bool = True

    # Evolution loop sizing
    generations: int = 6
    population_size: int = 100
    mutations_per_parent: int = 5
    survivor_fraction: float = 0.5

    # Offspring evaluation
    batch_size: int = 100
    max_offspring_attempts: int = 10

    # Filters: property name -> pair of bounds; None leaves that side open
    # (presumably (min, max) — confirm against the predictor's validity check).
    filters: dict = field(default_factory=_default_filters)

    def cn_objective(self, cn: float) -> float:
        """Score *cn* so that larger is better.

        Returns *cn* itself in maximize mode, otherwise the negated absolute
        distance from ``target_cn``.
        """
        if self.maximize_cn:
            return cn
        return -abs(cn - self.target_cn)
|
core/data_prep.py
ADDED
|
@@ -0,0 +1,34 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sqlite3
|
| 3 |
+
import pandas as pd
|
| 4 |
+
from sklearn.model_selection import train_test_split
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# Two dirname() calls: this file's directory, then its parent (the project root).
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")

TARGET_CN = "cn"  # Cetane number
# NOTE(review): N_FOLDS and TOP_K are not used in this module — presumably
# imported elsewhere; confirm before removing.
N_FOLDS = 5
TOP_K = 5

# NOTE: everything below runs at import time (DB read, prints).
print("Connecting to SQLite database...")
conn = sqlite3.connect(DB_PATH)

query = """
SELECT
F.Fuel_Name,
F.SMILES,
T.Standardised_DCN AS cn
FROM FUEL F
LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
ORDER BY F.fuel_id ASC;
"""
try:
    df = pd.read_sql_query(query, conn)
finally:
    # Always release the connection, even when the query fails.
    conn.close()

# Keep only rows that have both a target value and a SMILES string.
df.dropna(subset=[TARGET_CN, "SMILES"], inplace=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(df.head())
print(df.columns)


def load_data():
    """Return the module-level DataFrame loaded at import time (not a copy)."""
    return df
|
core/evolution/__init__.py
ADDED
|
File without changes
|
core/evolution/__pycache__/evolution.cpython-310.pyc
CHANGED
|
Binary files a/core/evolution/__pycache__/evolution.cpython-310.pyc and b/core/evolution/__pycache__/evolution.cpython-310.pyc differ
|
|
|
core/evolution/evolution.py
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from .population import Population
|
| 2 |
+
from .molecule import Molecule
|
| 3 |
+
from core.predictors.pure_component.property_predictor import PropertyPredictor
|
| 4 |
+
from core.config import EvolutionConfig
|
| 5 |
+
from crem.crem import mutate_mol
|
| 6 |
+
from rdkit import Chem
|
| 7 |
+
import pandas as pd
|
| 8 |
+
import numpy as np
|
| 9 |
+
import random
|
| 10 |
+
from typing import List, Tuple
|
| 11 |
+
from core.data_prep import df # Initial dataset for sampling
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
class MolecularEvolution:
    """Main evolutionary algorithm coordinator.

    Seeds a population from the reference dataset ``df``, then for each
    generation keeps the survivors, mutates them with CREM, evaluates the
    children with the property predictor and builds the next population.
    """
    BASE_DIR = Path(__file__).resolve().parent.parent.parent
    REP_DB_PATH = BASE_DIR / "data" / "fragments" / "diesel_fragments.db"

    # Properties that must pass the predictor's validity check.
    FILTERED_PROPERTIES = ('bp', 'density', 'lhv', 'dynamic_viscosity')

    # Seeding: CN quantile bins and rows sampled from each bin.
    INITIAL_CN_BINS = 30
    INITIAL_SAMPLES_PER_BIN = 20

    # Thresholds applied to the reported result tables.
    MIN_REPORTED_CN = 50
    MAX_REPORTED_CN_ERROR = 5
    MAX_REPORTED_YSI = 50

    def __init__(self, config: EvolutionConfig):
        """Create the predictor and an empty population for *config*."""
        self.config = config
        self.predictor = PropertyPredictor(config)
        self.population = Population(config)

    def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
        """Generate mutations for a molecule using CREM.

        Returns the non-empty mutant SMILES that are not already in the
        current population. A CREM failure yields an empty list, but is
        reported through a RuntimeWarning instead of being dropped silently.
        """
        try:
            mutants = list(mutate_mol(
                mol,
                db_name=str(self.REP_DB_PATH),
                max_size=2,
                return_mol=False
            ))
        except Exception as exc:
            # Best-effort by design: one failed mutation must not abort the
            # run. The warning makes a systematic failure (e.g. a missing
            # fragment DB) visible; identical warnings are shown once by
            # Python's default warning filter.
            import warnings
            warnings.warn(f"CREM mutation failed: {exc!r}", RuntimeWarning, stacklevel=2)
            return []

        seen = self.population.seen_smiles
        return [m for m in mutants if m and m not in seen]

    def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
        """Create Molecule objects from SMILES with predictions.

        All properties are predicted in one call for the whole list. A
        SMILES is skipped when its CN is None, when YSI is required but
        None, or when any filtered property fails ``predictor.is_valid``.
        """
        if not smiles_list:
            return []

        # Single featurization + all predictions
        predictions = self.predictor.predict_all_properties(smiles_list)

        molecules = []
        for i, smiles in enumerate(smiles_list):
            # Extract predictions for this molecule
            props = {k: v[i] for k, v in predictions.items()}

            # Validate required properties
            if props.get('cn') is None:
                continue
            if self.config.minimize_ysi and props.get('ysi') is None:
                continue

            # Validate filtered properties
            if not all(self.predictor.is_valid(k, props.get(k)) for k in self.FILTERED_PROPERTIES):
                continue

            molecules.append(Molecule(
                smiles=smiles,
                cn=props['cn'],
                cn_error=abs(props['cn'] - self.config.target_cn),
                cn_score=props['cn'],  # For maximize mode
                bp=props.get('bp'),
                ysi=props.get('ysi'),
                density=props.get('density'),
                lhv=props.get('lhv'),
                dynamic_viscosity=props.get('dynamic_viscosity')
            ))

        return molecules

    def initialize_population(self, initial_smiles: List[str]) -> int:
        """Initialize the population from initial SMILES; return the count added."""
        print("Predicting properties for initial population...")
        molecules = self._create_molecules(initial_smiles)
        return self.population.add_molecules(molecules)

    def _log_generation_stats(self, generation: int):
        """Print a one-line summary of the current population."""
        mols = self.population.molecules

        if self.config.maximize_cn:
            best_cn = max(mols, key=lambda m: m.cn)
            avg_cn = np.mean([m.cn for m in mols])

            print_msg = (f"Gen {generation}/{self.config.generations} | "
                         f"Pop {len(mols)} | "
                         f"Best CN: {best_cn.cn:.3f} | "
                         f"Avg CN: {avg_cn:.3f}")
        else:
            best_cn = min(mols, key=lambda m: m.cn_error)
            avg_cn_err = np.mean([m.cn_error for m in mols])

            print_msg = (f"Gen {generation}/{self.config.generations} | "
                         f"Pop {len(mols)} | "
                         f"Best CN err: {best_cn.cn_error:.3f} | "
                         f"Avg CN err: {avg_cn_err:.3f}")

        if self.config.minimize_ysi:
            front = self.population.pareto_front()
            best_ysi = min(mols, key=lambda m: m.ysi)
            avg_ysi = np.mean([m.ysi for m in mols])

            print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
                          f"Avg YSI: {avg_ysi:.3f} | "
                          f"Pareto: {len(front)}")

        print(print_msg)

    def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
        """Generate evaluated offspring by mutating randomly chosen survivors.

        Each attempt takes up to ``mutations_per_parent`` children from one
        parent; children are evaluated in batches of about ``batch_size``.
        A child SMILES is scheduled at most once per call, so a re-drawn
        parent contributes new children rather than repeats. Attempts are
        capped at ``target_count * max_offspring_attempts``.
        """
        target_count = self.config.population_size - len(survivors)
        max_attempts = target_count * self.config.max_offspring_attempts

        all_children = []   # children waiting for the next batch evaluation
        queued = set()      # every child SMILES scheduled during this call
        new_molecules = []

        print(f" → Generating offspring (target: {target_count})...")

        if not survivors:
            # random.choice() raises IndexError on an empty sequence.
            print(f" ✓ Generated {len(new_molecules)} valid offspring")
            return new_molecules

        for _ in range(max_attempts):
            if len(new_molecules) >= target_count:
                break

            # Generate mutations
            parent = random.choice(survivors)
            mol = Chem.MolFromSmiles(parent.smiles)
            if mol is None:
                continue

            # Skip children already scheduled and in-list repeats;
            # dict.fromkeys keeps first-seen order.
            candidates = dict.fromkeys(
                c for c in self._mutate_molecule(mol) if c not in queued
            )
            fresh = list(candidates)[:self.config.mutations_per_parent]
            queued.update(fresh)
            all_children.extend(fresh)

            # Process in larger batches (single featurization per batch)
            if len(all_children) >= self.config.batch_size:
                print(f" → Evaluating batch of {len(all_children)} (featurizing once)...")
                new_molecules.extend(self._create_molecules(all_children))
                all_children = []

        # Process remaining children
        if all_children:
            print(f" → Evaluating final batch of {len(all_children)}...")
            new_molecules.extend(self._create_molecules(all_children))

        print(f" ✓ Generated {len(new_molecules)} valid offspring")
        return new_molecules

    def _run_evolution_loop(self):
        """Run the main evolution loop for ``config.generations`` generations."""
        for gen in range(1, self.config.generations + 1):
            self._log_generation_stats(gen)

            survivors = self.population.get_survivors()
            if not survivors:
                # Nothing to mutate; keep the current population rather than
                # replacing it with an empty one.
                print("No survivors selected; stopping evolution early.")
                break
            offspring = self._generate_offspring(survivors)

            # Create new population
            new_pop = Population(self.config)
            new_pop.add_molecules(survivors + offspring)
            self.population = new_pop

    def _filter_and_sort(self, frame: pd.DataFrame, use_ysi: bool) -> pd.DataFrame:
        """Apply the reporting thresholds to *frame* and sort best-first."""
        if self.config.maximize_cn:
            mask = frame["cn"] > self.MIN_REPORTED_CN
            keys, ascending = ["cn"], [False]
        else:
            mask = frame["cn_error"] < self.MAX_REPORTED_CN_ERROR
            keys, ascending = ["cn_error"], [True]

        if use_ysi:
            mask = mask & (frame["ysi"] < self.MAX_REPORTED_YSI)
            keys, ascending = keys + ["ysi"], ascending + [True]

        return frame[mask].sort_values(keys, ascending=ascending)

    def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Generate the final-population and Pareto-front DataFrames.

        Both tables are filtered by the reporting thresholds, sorted
        best-first and given a 1-based ``rank`` column. The Pareto table is
        an empty DataFrame when YSI is not being minimised.
        """
        final_df = self.population.to_dataframe()

        # YSI only takes part when it is an objective and the column exists.
        use_ysi = self.config.minimize_ysi and "ysi" in final_df.columns
        final_df = self._filter_and_sort(final_df, use_ysi)

        # Overwrite rank safely
        final_df["rank"] = range(1, len(final_df) + 1)

        if self.config.minimize_ysi:
            pareto_mols = self.population.pareto_front()
            pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])

            if not pareto_df.empty:
                pareto_df = self._filter_and_sort(pareto_df, True)
                pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
        else:
            pareto_df = pd.DataFrame()

        return final_df, pareto_df

    def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Run the evolutionary algorithm.

        Returns:
            ``(final_df, pareto_df)``; two empty DataFrames when no initial
            molecule is valid.
        """
        # Initialize: stratified sample across CN quantile bins.
        # duplicates="drop" stops qcut raising when repeated CN values
        # produce identical bin edges.
        df_bins = pd.qcut(df["cn"], q=self.INITIAL_CN_BINS, duplicates="drop")
        per_bin = self.INITIAL_SAMPLES_PER_BIN
        initial_smiles = (
            df.groupby(df_bins, observed=False)
            # Cap at the bin size: sample() raises when asked for more rows
            # than the bin holds.
            .apply(lambda x: x.sample(min(per_bin, len(x)), random_state=42))
            .reset_index(drop=True)["SMILES"]
            .tolist()
        )
        init_count = self.initialize_population(initial_smiles)

        if init_count == 0:
            print("No valid initial molecules")
            return pd.DataFrame(), pd.DataFrame()

        print(f"✓ Initial population size: {init_count}\n")

        # Evolution
        self._run_evolution_loop()

        # Results
        return self._generate_results()
|
core/evolution/molecule.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass, asdict
|
| 2 |
+
from typing import Optional, Dict
|
| 3 |
+
|
| 4 |
+
@dataclass
class Molecule:
    """Represents a molecule with its properties."""
    smiles: str
    cn: float
    cn_error: float
    cn_score: float = 0.0  # For maximize mode (higher is better)
    bp: Optional[float] = None
    ysi: Optional[float] = None
    density: Optional[float] = None
    lhv: Optional[float] = None
    dynamic_viscosity: Optional[float] = None

    def dominates(self, other: 'Molecule', maximize_cn: bool = False) -> bool:
        """Check if this molecule Pareto-dominates another.

        The CN objective is "higher ``cn``" when *maximize_cn* is true,
        otherwise "lower ``cn_error``". The YSI objective is "lower ``ysi``".
        When either molecule has no YSI, that objective counts as a tie and
        the CN objective alone decides.

        Returns:
            True if this molecule is at least as good on both objectives and
            strictly better on at least one.
        """
        if maximize_cn:
            # For maximize mode: higher CN is better
            better_cn = self.cn >= other.cn
            strictly_better_cn = self.cn > other.cn
        else:
            better_cn = self.cn_error <= other.cn_error
            strictly_better_cn = self.cn_error < other.cn_error

        if self.ysi is None or other.ysi is None:
            # Comparing a float with None raises TypeError, so a missing YSI
            # on either side is treated as "not comparable".
            better_ysi = True
            strictly_better_ysi = False
        else:
            better_ysi = self.ysi <= other.ysi
            strictly_better_ysi = self.ysi < other.ysi

        strictly_better = strictly_better_cn or strictly_better_ysi
        return better_cn and better_ysi and strictly_better

    def to_dict(self) -> Dict:
        """Convert to a dictionary for DataFrame creation, dropping None-valued fields."""
        return {k: v for k, v in asdict(self).items() if v is not None}
|
core/evolution/population.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from typing import List
|
| 2 |
+
from core.config import EvolutionConfig
|
| 3 |
+
from .molecule import Molecule
|
| 4 |
+
import pandas as pd
|
| 5 |
+
|
| 6 |
+
class Population:
    """Manages the population of molecules.

    Molecules are stored in insertion order and de-duplicated by their
    SMILES string. The ``config`` object supplies ``minimize_ysi``,
    ``maximize_cn``, ``population_size``, ``survivor_fraction`` and the
    ``cn_objective`` callable used for ranking.
    """

    def __init__(self, config: EvolutionConfig):
        self.config = config
        self.molecules: List[Molecule] = []
        self.seen_smiles: set = set()

    def add_molecule(self, mol: Molecule) -> bool:
        """Add a molecule if it's not already in the population.

        Returns:
            True when the molecule was added, False when its SMILES was
            already present.
        """
        if mol.smiles in self.seen_smiles:
            return False
        self.molecules.append(mol)
        self.seen_smiles.add(mol.smiles)
        return True

    def add_molecules(self, molecules: List[Molecule]) -> int:
        """Add multiple molecules, return count added."""
        return sum(self.add_molecule(mol) for mol in molecules)

    def pareto_front(self) -> List[Molecule]:
        """Extract the Pareto front from the population.

        Returns an empty list when ``config.minimize_ysi`` is falsy;
        otherwise every molecule that no other molecule dominates.
        """
        if not self.config.minimize_ysi:
            return []

        return [
            mol for mol in self.molecules
            if not any(other.dominates(mol, self.config.maximize_cn)
                       for other in self.molecules if other is not mol)
        ]

    def get_survivors(self) -> List[Molecule]:
        """Select survivors for the next generation.

        The target count is ``population_size * survivor_fraction`` truncated
        to an int. With ``minimize_ysi`` the Pareto front is trimmed to, or
        topped up to, that count; otherwise the molecules with the highest
        ``cn_objective`` are kept.
        """
        target_size = int(self.config.population_size * self.config.survivor_fraction)

        if self.config.minimize_ysi:
            survivors = self.pareto_front()

            def sort_key(m):
                # A missing YSI sorts last instead of raising TypeError when
                # two molecules tie on the CN objective.
                ysi = m.ysi if m.ysi is not None else float("inf")
                return (
                    -self.config.cn_objective(m.cn),  # higher objective = better
                    ysi,
                )

            if len(survivors) > target_size:
                survivors = sorted(survivors, key=sort_key)[:target_size]
            elif len(survivors) < target_size:
                remainder = [m for m in self.molecules if m not in survivors]
                remainder = sorted(remainder, key=sort_key)
                survivors.extend(remainder[:target_size - len(survivors)])
        else:
            # Single objective mode
            survivors = sorted(
                self.molecules,
                key=lambda m: self.config.cn_objective(m.cn),
                reverse=True
            )[:target_size]

        return survivors

    def to_dataframe(self) -> pd.DataFrame:
        """Convert population to a ranked DataFrame.

        Rows are sorted by ``cn`` (descending) when ``maximize_cn`` is set,
        otherwise by ``cn_error`` (ascending); ``ysi`` (ascending) is the
        secondary key when ``minimize_ysi`` is set. A 1-based ``rank`` column
        is inserted first. An empty population yields an empty DataFrame.
        """
        if not self.molecules:
            # Sorting an empty frame by a named column would raise KeyError.
            return pd.DataFrame()

        df = pd.DataFrame([m.to_dict() for m in self.molecules])

        if self.config.maximize_cn:
            sort_spec = [("cn", False)]  # Descending CN
        else:
            sort_spec = [("cn_error", True)]
        if self.config.minimize_ysi:
            sort_spec.append(("ysi", True))  # Ascending YSI

        # Molecule.to_dict() drops None values, so a column can be absent
        # when no molecule carries that property; skip such sort keys.
        sort_spec = [(col, asc) for col, asc in sort_spec if col in df.columns]
        if sort_spec:
            df = df.sort_values(
                [col for col, _ in sort_spec],
                ascending=[asc for _, asc in sort_spec],
            )

        df.insert(0, 'rank', range(1, len(df) + 1))
        return df
|
core/predictors/__init__.py
ADDED
|
File without changes
|
core/predictors/mixture/__init__.py
ADDED
|
File without changes
|
core/predictors/pure_component/__pycache__/generic.cpython-310.pyc
CHANGED
|
Binary files a/core/predictors/pure_component/__pycache__/generic.cpython-310.pyc and b/core/predictors/pure_component/__pycache__/generic.cpython-310.pyc differ
|
|
|
core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc
CHANGED
|
Binary files a/core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc and b/core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc differ
|
|
|
core/predictors/pure_component/generic.py
ADDED
|
@@ -0,0 +1,51 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import joblib
|
| 2 |
+
import numpy as np
|
| 3 |
+
|
| 4 |
+
from pathlib import Path
|
| 5 |
+
|
| 6 |
+
# Make FeatureSelector discoverable for joblib
|
| 7 |
+
import sys
|
| 8 |
+
from core.shared_features import FeatureSelector
|
| 9 |
+
|
| 10 |
+
# --- FIX FOR JOBLIB / PICKLE ---
# The saved models were trained with FeatureSelector living in __main__, so
# the class is exposed on whatever module is currently running as __main__
# (pytest, HF, Docker) unless that module already provides the attribute.
main_module = sys.modules.get("__main__")
if main_module is not None:
    if not hasattr(main_module, "FeatureSelector"):
        main_module.FeatureSelector = FeatureSelector
|
| 16 |
+
|
| 17 |
+
class GenericPredictor:
    """Property-agnostic wrapper around a saved model and its feature selector."""

    def __init__(self, model_dir: Path, property_name: str):
        """
        Load the predictor artifacts found in a model directory.

        Args:
            model_dir: Directory that holds ``model.joblib`` and ``selector.joblib``
            property_name: Name of the property (used in console messages)
        """
        print(f"Loading {property_name} Predictor...")

        # Both artifacts sit directly inside model_dir.
        self.model = joblib.load(model_dir / "model.joblib")
        self.selector = FeatureSelector.load(model_dir / "selector.joblib")
        self.property_name = property_name

        print(f"✓ {property_name} Predictor ready!")

    def predict_from_features(self, X_full):
        """Run the selector and the model on an already-featurized batch.

        Returns an empty list for ``None``/empty input, the predictions as a
        plain list on success, and one ``None`` per input row when the
        selector or the model raises.
        """
        nothing_to_predict = X_full is None or len(X_full) == 0
        if nothing_to_predict:
            return []

        try:
            reduced = self.selector.transform(X_full)
            return self.model.predict(reduced).tolist()
        except Exception as e:
            # Best effort: report the failure and keep the batch length.
            print(f"⚠ Warning: {self.property_name} prediction failed: {e}")
            return [None] * len(X_full)
|
core/predictors/pure_component/hf_models.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from pathlib import Path
|
| 2 |
+
from huggingface_hub import snapshot_download
|
| 3 |
+
|
| 4 |
+
# Property key -> Hugging Face model repository id.
HF_MODELS = {
    "cn": "SalZa2004/Cetane_Number_Predictor",
    "ysi": "SalZa2004/YSI_Predictor",
    "bp": "SalZa2004/Boiling_Point_Predictor",
    "density": "SalZa2004/Density_Predictor",
    "lhv": "SalZa2004/LHV_Predictor",
    "dynamic_viscosity": "SalZa2004/Dynamic_Viscosity_Predictor",
}


def load_models():
    """Download a snapshot of every repo in ``HF_MODELS``.

    Returns:
        A dict mapping each property key to the local snapshot directory
        (as a ``Path``) returned by ``snapshot_download``.
    """
    local_dirs = {}
    for prop, repo in HF_MODELS.items():
        local_dirs[prop] = Path(snapshot_download(repo_id=repo, repo_type="model"))
    return local_dirs
|
core/predictors/pure_component/property_predictor.py
ADDED
|
@@ -0,0 +1,77 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from .generic import GenericPredictor
|
| 3 |
+
from core.shared_features import featurize_df
|
| 4 |
+
from core.config import EvolutionConfig
|
| 5 |
+
from typing import List, Dict, Optional, Tuple, Callable
|
| 6 |
+
from .hf_models import load_models
|
| 7 |
+
|
| 8 |
+
# Resolved once at import time: property key -> local model directory.
PREDICTOR_PATHS = load_models()


class PropertyPredictor:
    """Handles batch prediction for all molecular properties."""

    def __init__(self, config: EvolutionConfig):
        """Create the predictors required by ``config``.

        The CN predictor is always loaded; the YSI predictor only when
        ``config.minimize_ysi`` is set.
        """
        self.config = config

        # Initialize only the predictors we need
        self.predictors = {}

        # Always need CN predictor
        self.predictors['cn'] = GenericPredictor(
            PREDICTOR_PATHS['cn'],
            'Cetane Number'
        )

        # Conditional predictors
        if config.minimize_ysi:
            self.predictors['ysi'] = GenericPredictor(
                PREDICTOR_PATHS['ysi'],
                'YSI'
            )

        # Define validation rules.
        # NOTE(review): is_valid() below reads config.filters and never
        # consults this table; it is kept so existing attribute access works.
        self.validators = {
            'bp': lambda v: self.config.min_bp <= v <= self.config.max_bp,
            'density': lambda v: v > self.config.min_density,
            'lhv': lambda v: v > self.config.min_lhv,
            'dynamic_viscosity': lambda v: self.config.min_dynamic_viscosity < v <= self.config.max_dynamic_viscosity
        }

    def _safe_predict(self, predictions: List) -> List[Optional[float]]:
        """Safely convert predictions, handling None/NaN/inf values."""
        return [
            float(pred) if pred is not None and np.isfinite(pred) else None
            for pred in predictions
        ]

    def predict_all_properties(self, smiles_list: List[str]) -> Dict[str, List[Optional[float]]]:
        """
        Predict all properties for a batch of SMILES.
        Featurizes ONCE and reuses features for all predictors.

        Returns:
            A dict keyed by property name. Each list has exactly
            ``len(smiles_list)`` entries, position-aligned with the input;
            entries for SMILES that could not be featurized are ``None``.
        """
        if not smiles_list:
            return {prop: [] for prop in self.predictors.keys()}

        # Local import: this module does not import pandas at file level.
        import pandas as pd

        n_total = len(smiles_list)

        # featurize_df drops rows it cannot featurize. Carrying the original
        # position through as an extra column lets each prediction be put
        # back at the index of the SMILES it belongs to.
        frame = pd.DataFrame({"SMILES": list(smiles_list), "_pos": range(n_total)})

        # OPTIMIZATION: Featurize only once per batch
        X_full, df_valid = featurize_df(frame, smiles_col="SMILES", return_df=True)

        if X_full is None:
            return {prop: [None] * n_total for prop in self.predictors.keys()}

        positions = df_valid["_pos"].tolist()

        # Predict all properties using the same features
        results = {}
        for prop_name, predictor in self.predictors.items():
            predictions = self._safe_predict(predictor.predict_from_features(X_full))
            aligned: List[Optional[float]] = [None] * n_total
            for pos, value in zip(positions, predictions):
                aligned[pos] = value
            results[prop_name] = aligned

        return results

    def is_valid(self, name, value):
        """Check ``value`` against the ``(lo, hi)`` bounds in ``config.filters``.

        A ``None`` value or a property without a filter entry is treated as
        valid; a ``None`` bound means that side is unbounded.
        """
        if value is None or name not in self.config.filters:
            return True
        lo, hi = self.config.filters[name]
        if lo is not None and value < lo:
            return False
        if hi is not None and value > hi:
            return False
        return True
|
core/shared_features.py
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import sqlite3
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
from sklearn.model_selection import train_test_split
|
| 6 |
+
|
| 7 |
+
# Two directory levels above this file; the database lives under data/database/.
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")


def load_raw_data():
    """Load raw data from database.

    Reads ``Fuel_Name``, ``SMILES`` and ``Standardised_DCN`` (aliased to
    ``cn``) from the FUEL/TARGET tables of the SQLite file at ``DB_PATH``.

    Returns:
        A DataFrame with the rows that have both ``cn`` and ``SMILES``.
    """
    print("Connecting to SQLite database...")

    query = """
        SELECT
            F.Fuel_Name,
            F.SMILES,
            T.Standardised_DCN AS cn
        FROM FUEL F
        LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
    """

    conn = sqlite3.connect(DB_PATH)
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # Release the connection even when the query fails.
        conn.close()

    # Clean data
    df.dropna(subset=["cn", "SMILES"], inplace=True)

    return df
|
| 30 |
+
|
| 31 |
+
|
| 32 |
+
# ============================================================================
|
| 33 |
+
# 2. FEATURIZATION MODULE
|
| 34 |
+
# ============================================================================
|
| 35 |
+
from rdkit import Chem
|
| 36 |
+
from rdkit.Chem import Descriptors, rdFingerprintGenerator
|
| 37 |
+
from tqdm import tqdm
|
| 38 |
+
|
| 39 |
+
# Module-wide descriptor tables taken from RDKit's (name, function) list:
# names in one list, the matching callables in the other, same order.
DESCRIPTOR_NAMES = [name for name, _ in Descriptors._descList]
desc_functions = [func for _, func in Descriptors._descList]
|
| 42 |
+
|
| 43 |
+
def morgan_fp_from_mol(mol, radius=2, n_bits=2048):
    """Return the Morgan fingerprint of ``mol`` as an integer NumPy array.

    Args:
        mol: Molecule object passed to the fingerprint generator.
        radius: Morgan radius handed to ``GetMorganGenerator``.
        n_bits: Fingerprint size (``fpSize``).
    """
    generator = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
    bit_string = generator.GetFingerprint(mol).ToBitString()
    # One array element per character of the bit string.
    return np.array(list(bit_string), dtype=int)
|
| 49 |
+
|
| 50 |
+
def physchem_desc_from_mol(mol):
    """Calculate physicochemical descriptors.

    Evaluates every function in ``desc_functions`` on ``mol`` and returns the
    values as a float32 array with NaN and +/-inf replaced by 0.0.

    Returns:
        The descriptor array, or ``None`` when any descriptor computation
        raises.
    """
    try:
        desc = np.array([fn(mol) for fn in desc_functions], dtype=np.float32)
        desc = np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
        return desc
    except Exception:
        # Best effort: a failing descriptor marks the molecule as unusable.
        # Unlike a bare ``except``, this lets KeyboardInterrupt/SystemExit
        # propagate.
        return None
|
| 58 |
+
|
| 59 |
+
def featurize(smiles):
    """Build the feature vector for one SMILES string.

    Returns:
        The Morgan fingerprint followed by the physicochemical descriptors
        as a single 1-D array, or ``None`` when the SMILES cannot be parsed
        or either feature part is unavailable.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fingerprint = morgan_fp_from_mol(mol)
    descriptors = physchem_desc_from_mol(mol)

    if fingerprint is not None and descriptors is not None:
        return np.hstack([fingerprint, descriptors])
    return None
|
| 72 |
+
|
| 73 |
+
def featurize_df(df, smiles_col="SMILES", return_df=True):
    """
    Featurize a DataFrame or list of SMILES.

    Args:
        df: A DataFrame with a ``smiles_col`` column, or a list, NumPy array
            or Series of SMILES strings (wrapped into a one-column frame).
        smiles_col: Name of the column holding the SMILES strings.
        return_df: When true, also return the rows that were featurized.

    Returns:
        ``(X, df_valid)`` when ``return_df`` is true, otherwise ``X``. ``X``
        stacks one feature row per successfully featurized molecule and
        ``df_valid`` holds the matching input rows with a fresh index. Rows
        that fail are dropped. When nothing could be featurized the result
        is ``(None, None)`` or ``None`` respectively.
    """
    # Handle different input types
    if isinstance(df, (list, np.ndarray, pd.Series)):
        df = pd.DataFrame({smiles_col: df})

    # Convert all SMILES to molecules in batch
    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_col]]

    features = []
    valid_indices = []

    # Process valid molecules
    for i, mol in enumerate(tqdm(mols, desc="Featurizing")):
        if mol is None:
            continue

        try:
            fp = morgan_fp_from_mol(mol)
            desc = physchem_desc_from_mol(mol)

            if fp is not None and desc is not None:
                features.append(np.hstack([fp, desc]))
                valid_indices.append(i)
        except Exception:
            # Skip molecules that fail, but let KeyboardInterrupt/SystemExit
            # through (a bare ``except`` would swallow those too).
            continue

    if len(features) == 0:
        return (None, None) if return_df else None

    X = np.vstack(features)

    if return_df:
        df_valid = df.iloc[valid_indices].reset_index(drop=True)
        return X, df_valid
    else:
        return X
|
| 114 |
+
|
| 115 |
+
|
| 116 |
+
# ============================================================================
|
| 117 |
+
# 3. FEATURE SELECTOR CLASS
|
| 118 |
+
# ============================================================================
|
| 119 |
+
import joblib
|
| 120 |
+
|
| 121 |
+
class FeatureSelector:
    """Feature selection pipeline that can be saved and reused.

    The input matrix is expected to hold ``n_morgan`` fingerprint columns
    followed by descriptor columns. Fitting drops highly correlated
    descriptors and then keeps the ``top_k`` most important remaining
    features according to an ExtraTreesRegressor.
    """

    def __init__(self, n_morgan=2048, corr_threshold=0.95, top_k=300):
        self.n_morgan = n_morgan
        self.corr_threshold = corr_threshold
        self.top_k = top_k

        # Filled during fit()
        self.corr_cols_to_drop = None
        self.selected_indices = None
        self.is_fitted = False

    def fit(self, X, y):
        """Fit the feature selector on training data.

        Args:
            X: 2-D feature matrix (fingerprint columns first).
            y: Target values used for the importance ranking.

        Returns:
            ``self``, so calls can be chained.
        """
        print("\n" + "="*70)
        print("FITTING FEATURE SELECTOR")
        print("="*70)

        X = np.asarray(X)

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        print(f"Morgan fingerprints: {X_mfp.shape[1]}")
        print(f"Descriptors: {X_desc.shape[1]}")

        # Step 2: Remove correlated descriptors
        desc_df = pd.DataFrame(X_desc)
        corr_matrix = desc_df.corr().abs()
        # Keep only the strict upper triangle so each pair is examined once.
        upper = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )

        self.corr_cols_to_drop = [
            col for col in upper.columns if any(upper[col] > self.corr_threshold)
        ]

        print(f"Correlated descriptors removed: {len(self.corr_cols_to_drop)}")

        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        print(f"Features after correlation filter: {X_corr.shape[1]}")

        # Step 3: Feature importance selection
        from sklearn.ensemble import ExtraTreesRegressor

        print("Running feature importance selection...")
        model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_corr, y)

        importances = model.feature_importances_
        indices = np.argsort(importances)[::-1]  # most important first

        self.selected_indices = indices[:self.top_k]

        print(f"Final selected features: {len(self.selected_indices)}")

        self.is_fitted = True
        return self

    def transform(self, X):
        """Apply the fitted feature selection to new data.

        Args:
            X: 2-D feature matrix, or a single 1-D feature vector (such as
                the output of ``featurize``), which is treated as one row.

        Returns:
            A 2-D array containing only the selected columns.

        Raises:
            RuntimeError: If the selector has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("FeatureSelector must be fitted before transform!")

        # Promote a single sample to shape (1, n_features) so the column
        # slicing below works for both vectors and matrices.
        X = np.atleast_2d(np.asarray(X))

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        # Step 2: Remove same correlated descriptors
        desc_df = pd.DataFrame(X_desc)
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        # Step 3: Select same important features
        X_selected = X_corr[:, self.selected_indices]

        return X_selected

    def fit_transform(self, X, y):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def save(self, filepath='feature_selector.joblib'):
        """Save the fitted selector.

        Raises:
            RuntimeError: If the selector has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted selector!")

        # Create directory if it doesn't exist
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        joblib.dump(self, filepath)
        print(f"✓ Feature selector saved to {filepath}")

    @staticmethod
    def load(filepath='feature_selector.joblib'):
        """Load a fitted selector.

        Raises:
            RuntimeError: If the loaded object is not a fitted selector.
        """
        # NOTE: joblib.load unpickles the file and can execute arbitrary
        # code; only load artifacts from a trusted source.
        selector = joblib.load(filepath)
        # getattr keeps the documented RuntimeError even when the file holds
        # an object without an ``is_fitted`` attribute.
        if not getattr(selector, "is_fitted", False):
            raise RuntimeError("Loaded selector is not fitted!")
        print(f"✓ Feature selector loaded from {filepath}")
        return selector
|