SalZa2004 committed on
Commit
315d4ad
·
1 Parent(s): b1a6659

updated applications

Browse files
Files changed (38) hide show
  1. applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc +0 -0
  2. applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc +0 -0
  3. applications/3_molecule_generator/__pycache__/main.cpython-310.pyc +0 -0
  4. applications/3_molecule_generator/__pycache__/results.cpython-310.pyc +0 -0
  5. applications/__init__.py +0 -0
  6. applications/docker/.dockerignore +0 -5
  7. applications/docker/Dockerfile +0 -33
  8. applications/docker/docker-compose.yml +0 -22
  9. applications/mixture_aware_generator/__init__.py +0 -0
  10. applications/mixture_predictor/__init__.py +0 -0
  11. applications/molecule_generator/__init__.py +0 -0
  12. applications/molecule_generator/cli.py +43 -0
  13. applications/molecule_generator/main.py +34 -0
  14. applications/molecule_generator/results.py +37 -0
  15. applications/molecule_generator/results/final_population.csv +6 -0
  16. applications/molecule_generator/results/pareto_front.csv +3 -0
  17. applications/pure_predictor/__init__.py +0 -0
  18. applications/pure_predictor/cli.py +29 -0
  19. applications/pure_predictor/main.py +82 -0
  20. applications/pure_predictor/results.py +11 -0
  21. core/__init__.py +0 -0
  22. core/__pycache__/config.cpython-310.pyc +0 -0
  23. core/blending/__init__.py +0 -0
  24. core/config.py +26 -0
  25. core/data_prep.py +34 -0
  26. core/evolution/__init__.py +0 -0
  27. core/evolution/__pycache__/evolution.cpython-310.pyc +0 -0
  28. core/evolution/evolution.py +234 -0
  29. core/evolution/molecule.py +33 -0
  30. core/evolution/population.py +86 -0
  31. core/predictors/__init__.py +0 -0
  32. core/predictors/mixture/__init__.py +0 -0
  33. core/predictors/pure_component/__pycache__/generic.cpython-310.pyc +0 -0
  34. core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc +0 -0
  35. core/predictors/pure_component/generic.py +51 -0
  36. core/predictors/pure_component/hf_models.py +17 -0
  37. core/predictors/pure_component/property_predictor.py +77 -0
  38. core/shared_features.py +223 -0
applications/3_molecule_generator/__pycache__/__init__.cpython-310.pyc DELETED
Binary file (143 Bytes)
 
applications/3_molecule_generator/__pycache__/cli.cpython-310.pyc DELETED
Binary file (1.76 kB)
 
applications/3_molecule_generator/__pycache__/main.cpython-310.pyc DELETED
Binary file (918 Bytes)
 
applications/3_molecule_generator/__pycache__/results.cpython-310.pyc DELETED
Binary file (1.85 kB)
 
applications/__init__.py ADDED
File without changes
applications/docker/.dockerignore DELETED
@@ -1,5 +0,0 @@
1
- venv*
2
- __pycache__/
3
- *.pyc
4
- .git/
5
- .gitignore
 
 
 
 
 
 
applications/docker/Dockerfile DELETED
@@ -1,33 +0,0 @@
1
- FROM python:3.10-slim
2
-
3
- # Avoid interactive prompts
4
- ENV DEBIAN_FRONTEND=noninteractive
5
-
6
- # System deps (important for RDKit / ML)
7
- RUN apt-get update && apt-get install -y \
8
- git \
9
- git-lfs \
10
- build-essential \
11
- sqlite3 \
12
- && rm -rf /var/lib/apt/lists/*
13
-
14
- # Install git-lfs
15
- RUN git lfs install
16
-
17
- # Set working directory
18
- WORKDIR /app
19
-
20
- # Copy dependency files first (better caching)
21
- COPY requirements.txt .
22
-
23
- RUN pip install --upgrade pip setuptools wheel \
24
- && pip install -r requirements.txt
25
-
26
- # Copy the rest of the project
27
- COPY . .
28
-
29
- # Editable install
30
- RUN pip install -e .
31
-
32
- # Default command (can override)
33
- CMD ["bash"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
applications/docker/docker-compose.yml DELETED
@@ -1,22 +0,0 @@
1
- services:
2
- biofuel-ml:
3
- build:
4
- context: ..
5
- dockerfile: docker/Dockerfile
6
- image: biofuel-ml:latest
7
- container_name: biofuel-ml
8
- tty: true
9
- stdin_open: true
10
-
11
- volumes:
12
- - ..:/app
13
- - ~/.cache/huggingface:/root/.cache/huggingface
14
-
15
- working_dir: /app
16
-
17
- environment:
18
- - PYTHONUNBUFFERED=1
19
- - HF_HOME=/root/.cache/huggingface
20
- - PYTHONHASHSEED=42
21
-
22
- command: bash
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
applications/mixture_aware_generator/__init__.py ADDED
File without changes
applications/mixture_predictor/__init__.py ADDED
File without changes
applications/molecule_generator/__init__.py ADDED
File without changes
applications/molecule_generator/cli.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from core.config import EvolutionConfig


def get_user_config() -> EvolutionConfig:
    """Get configuration from user input.

    Prompts for the optimization mode (target a CN value vs. maximize CN)
    and whether YSI should be minimized, then returns the corresponding
    EvolutionConfig.
    """
    print("\n" + "="*70)
    print("MOLECULAR EVOLUTION WITH GENETIC ALGORITHM")
    print("="*70)

    # Choose optimization mode (re-prompt until valid)
    print("\nOptimization Mode:")
    print("1. Target a specific CN value (minimize error from target)")
    print("2. Maximize CN (find highest possible CN)")
    mode = input("Select mode (1 or 2): ").strip()
    while mode not in ("1", "2"):
        print("Invalid selection. Please choose 1 or 2.")
        mode = input("Select mode (1 or 2): ").strip()
    maximize_cn = (mode == "2")

    if maximize_cn:
        print("\n✓ Mode: Maximize Cetane Number")
        target = 100.0  # Dummy target, not used in maximize mode
    else:
        print("\n✓ Mode: Target Cetane Number")
        while True:
            raw = input("Enter target CN: ").strip() or "50"
            # FIX: a non-numeric entry used to crash the CLI with an
            # uncaught ValueError from float(); re-prompt instead.
            try:
                target = float(raw)
            except ValueError:
                print("Invalid number. Please enter a numeric CN value.")
                continue
            if target > 40:
                break
            print("⚠️ Target CN is too low, optimization may be challenging.")
            print("Consider using a higher target CN for better results.\n")

    # Ask about YSI
    minimize_ysi = input("\nMinimize YSI (y/n): ").strip().lower() in ['y', 'yes']

    # Print configuration summary
    print("\n" + "="*70)
    print("CONFIGURATION SUMMARY:")
    print(f"  • Mode: {'Maximize CN' if maximize_cn else f'Target CN = {target}'}")
    print(f"  • Minimize YSI: {'Yes' if minimize_ysi else 'No'}")
    print(f"  • Optimization: {'Multi-objective (CN + YSI)' if minimize_ysi else 'Single-objective (CN only)'}")
    print("="*70 + "\n")

    return EvolutionConfig(target_cn=target, maximize_cn=maximize_cn, minimize_ysi=minimize_ysi)
applications/molecule_generator/main.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

SEED = 42

import random
import numpy as np

from .cli import get_user_config
from .results import display_results, save_results
from core.evolution.evolution import MolecularEvolution
# FeatureSelector kept importable: the joblib-pickled selector artifacts
# need the class resolvable at unpickle time.
from core.shared_features import FeatureSelector

# Seed every RNG used by the run for reproducibility.
# NOTE(review): setting PYTHONHASHSEED after interpreter start does not
# change hash randomization of the current process (it only affects
# subprocesses) — confirm whether that is the intent.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)


def run(config):
    """Run the evolutionary search; return (final_df, pareto_df)."""
    evolution = MolecularEvolution(config)
    return evolution.evolve()


def main():
    """Main execution function: prompt, evolve, display, save."""
    config = get_user_config()

    # FIX: reuse run() instead of duplicating its body here.
    final_df, pareto_df = run(config)

    # Display and save results
    display_results(final_df, pareto_df, config)
    save_results(final_df, pareto_df, config.minimize_ysi)


if __name__ == "__main__":
    main()
applications/molecule_generator/results.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
from pathlib import Path
import pandas as pd
from core.config import EvolutionConfig


def save_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, minimize_ysi: bool):
    """Write the final population (and, in multi-objective runs, the
    non-empty Pareto front) as CSV files under ./results."""
    out_dir = Path("results")
    out_dir.mkdir(exist_ok=True)

    final_df.to_csv(out_dir / "final_population.csv", index=False)
    if minimize_ysi and not pareto_df.empty:
        pareto_df.to_csv(out_dir / "pareto_front.csv", index=False)

    print("\n✓ Results saved to results/")


def display_results(final_df: pd.DataFrame, pareto_df: pd.DataFrame, config: EvolutionConfig):
    """Print the top candidates — and the Pareto front when YSI is
    tracked — to stdout."""
    preferred = ["rank", "smiles", "cn", "cn_error", "ysi", "bp", "density", "lhv", "dynamic_viscosity"]

    # cn_error is meaningless when maximizing CN, so hide it.
    if config.maximize_cn:
        preferred.remove("cn_error")

    banner = "=" * 70
    shown = [c for c in preferred if c in final_df.columns]

    print("\n" + banner)
    print("=== BEST CANDIDATES ===")
    print(banner)
    print(final_df.head(10)[shown].to_string(index=False))

    if config.minimize_ysi and not pareto_df.empty:
        pareto_shown = [c for c in preferred if c in pareto_df.columns]
        print("\n" + banner)
        print("=== PARETO FRONT (Non-dominated solutions) ===")
        print(banner)
        print(pareto_df[pareto_shown].head(20).to_string(index=False))
applications/molecule_generator/results/final_population.csv ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ rank,smiles,cn,cn_error,cn_score,ysi
2
+ 1,CCCCCC(C)C(=O)O,55.49838163060194,0.49838163060194063,55.49838163060194,45.2672860656464
3
+ 2,CCCCC=O,55.79958905600905,0.7995890560090473,55.79958905600905,23.523889793928337
4
+ 3,CCCCC(CC)C(=O)O,57.088458252834485,2.088458252834485,57.088458252834485,44.53587682413441
5
+ 4,CCCCC(C)C(=O)O,57.32411875680274,2.32411875680274,57.32411875680274,36.7278812007473
6
+ 5,CCCCC(=O)O,57.959166968253996,2.959166968253996,57.959166968253996,24.625850459125378
applications/molecule_generator/results/pareto_front.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ rank,smiles,cn,cn_error,cn_score,ysi
2
+ 1,CCCCCC(C)C(=O)O,55.49838163060194,0.49838163060194063,55.49838163060194,45.2672860656464
3
+ 2,CCCCC=O,55.79958905600905,0.7995890560090473,55.79958905600905,23.523889793928337
applications/pure_predictor/__init__.py ADDED
File without changes
applications/pure_predictor/cli.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# applications/1_pure_predictor/cli.py

from rdkit import Chem
from rdkit.Chem import rdinchi


def get_user_config():
    """
    Collect user inputs for pure-component property prediction.
    SMILES-only input.

    Returns:
        dict with "mode" ("1" single / "2" batch) and "smiles" — a SMILES
        string in single mode, a file path in batch mode.
    """
    mode = input("Select prediction mode (1: Single, 2: Batch): ").strip()
    while mode not in {"1", "2"}:
        print("Invalid selection. Please choose 1 or 2.")
        mode = input("Select prediction mode (1: Single, 2: Batch): ").strip()

    if mode == "1":
        smiles = input("Enter SMILES string: ").strip()
        # FIX: re-prompt on an invalid SMILES (consistent with the mode
        # prompt above) instead of crashing the CLI with an uncaught
        # ValueError.
        while Chem.MolFromSmiles(smiles) is None:
            print("Invalid SMILES string.")
            smiles = input("Enter SMILES string: ").strip()
    else:
        smiles = input("Enter path to SMILES file: ").strip()

    return {
        "mode": mode,
        "smiles": smiles
    }
29
+
applications/pure_predictor/main.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# applications/1_pure_predictor/main.py
import os
os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
os.environ["HF_HUB_DISABLE_TELEMETRY"] = "1"

# FeatureSelector kept importable: the joblib-pickled selector artifacts
# need the class resolvable at unpickle time.
from core.shared_features import featurize_df, FeatureSelector
from core.predictors.pure_component.generic import GenericPredictor
from core.predictors.pure_component.hf_models import load_models

from .cli import get_user_config
from .results import display_results

# Load model paths (local or HF)
PREDICTOR_PATHS = load_models()

# (result key, PREDICTOR_PATHS key, human-readable predictor name),
# one entry per property, in display order.  The table replaces six
# near-identical hand-written predictor blocks.
PROPERTY_SPECS = [
    ("CN", "cn", "Cetane Number"),
    ("YSI", "ysi", "YSI"),
    ("BOILING POINT", "bp", "Boiling Point"),
    ("DENSITY", "density", "Density"),
    ("LHV", "lhv", "Lower Heating Value"),
    ("DYNAMIC VISCOSITY", "dynamic_viscosity", "Dynamic Viscosity"),
]


def run(config):
    """
    Run pure-component property prediction.

    Args:
        config: dict from get_user_config() with keys "mode" and "smiles".

    Returns:
        dict mapping "SMILES" plus one upper-case key per property to the
        predicted value.

    Raises:
        RuntimeError: if featurization fails for the input SMILES.
    """
    # NOTE(review): in batch mode (mode == "2") config["smiles"] holds a
    # file path but is featurized below as if it were a SMILES string —
    # batch handling still needs to be implemented.
    smiles = config["smiles"]

    # --- Featurize ONCE and reuse the matrix for every predictor ---
    X_full = featurize_df([smiles], return_df=False)
    if X_full is None:
        raise RuntimeError("Featurization failed for input SMILES.")

    # --- Predict each property with its dedicated model ---
    result = {"SMILES": smiles}
    for result_key, path_key, display_name in PROPERTY_SPECS:
        predictor = GenericPredictor(PREDICTOR_PATHS[path_key], display_name)
        result[result_key] = predictor.predict_from_features(X_full)[0]

    return result


def main():
    """CLI entry point: prompt, predict, display."""
    config = get_user_config()
    results = run(config)
    display_results(results)


if __name__ == "__main__":
    main()
applications/pure_predictor/results.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
# applications/1_pure_predictor/results.py

import pandas as pd


def display_results(result: dict):
    """
    Display pure-component prediction results.

    Renders the single result dict as a one-row table on stdout.
    """
    table = pd.DataFrame([result])
    print("\n=== PURE COMPONENT PROPERTY PREDICTION ===\n")
    print(table.to_string(index=False))
core/__init__.py ADDED
File without changes
core/__pycache__/config.cpython-310.pyc CHANGED
Binary files a/core/__pycache__/config.cpython-310.pyc and b/core/__pycache__/config.cpython-310.pyc differ
 
core/blending/__init__.py ADDED
File without changes
core/config.py ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, field


@dataclass
class EvolutionConfig:
    """Settings for the molecular genetic algorithm.

    Covers the objective (target vs. maximize CN, optional YSI
    minimization), GA sizing parameters, and per-property filter bounds.
    """

    # --- Objective ---
    target_cn: float = 50.0     # desired CN in target mode
    maximize_cn: bool = False   # True: maximize CN instead of targeting
    minimize_ysi: bool = True   # True: multi-objective (CN + YSI)

    # --- GA parameters ---
    generations: int = 6
    population_size: int = 100
    mutations_per_parent: int = 5
    survivor_fraction: float = 0.5

    # --- Batching ---
    batch_size: int = 100
    max_offspring_attempts: int = 10

    # Filters: property -> (lower bound, upper bound); None = unbounded.
    filters: dict = field(default_factory=lambda: {
        "bp": (60.0, 250.0),
        "density": (720.0, None),
        "lhv": (30.0, None),
        "dynamic_viscosity": (0.0, 2.0),
    })

    def cn_objective(self, cn: float) -> float:
        """Score a CN value: raw CN in maximize mode, else -|cn - target|."""
        return cn if self.maximize_cn else -abs(cn - self.target_cn)

    def __getattr__(self, name: str):
        """Expose filter bounds as `min_<prop>` / `max_<prop>` attributes.

        FIX: PropertyPredictor reads `config.min_bp`, `config.max_bp`,
        `config.min_density`, ... which previously did not exist on this
        class and raised AttributeError; derive them from `filters`.
        """
        prefix, _, key = name.partition("_")
        if prefix in ("min", "max"):
            # Fetch defensively: __getattr__ can run before __init__ has
            # populated the instance dict (e.g. during copy/pickle).
            filters = self.__dict__.get("filters")
            if filters and key in filters:
                return filters[key][0 if prefix == "min" else 1]
        raise AttributeError(name)
core/data_prep.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os
import sqlite3
import pandas as pd
from sklearn.model_selection import train_test_split

# Project root: one level above core/ (this file's directory).
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")

TARGET_CN = "cn"  # Cetane number target column
N_FOLDS = 5
TOP_K = 5

# NOTE(review): this module queries the database at import time.  The
# side effect is kept deliberately — core.evolution.evolution does
# `from core.data_prep import df` and relies on the module-level frame.
# (FIX: the duplicate `import os` was removed.)
print("Connecting to SQLite database...")
conn = sqlite3.connect(DB_PATH)

query = """
SELECT
    F.Fuel_Name,
    F.SMILES,
    T.Standardised_DCN AS cn
FROM FUEL F
LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
ORDER BY F.fuel_id ASC;
"""
df = pd.read_sql_query(query, conn)
conn.close()

# Keep only rows with both a SMILES and a CN value.
df.dropna(subset=[TARGET_CN, "SMILES"], inplace=True)

train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
print(df.head())
print(df.columns)


def load_data():
    """Return the cleaned fuel DataFrame loaded at import time."""
    return df
core/evolution/__init__.py ADDED
File without changes
core/evolution/__pycache__/evolution.cpython-310.pyc CHANGED
Binary files a/core/evolution/__pycache__/evolution.cpython-310.pyc and b/core/evolution/__pycache__/evolution.cpython-310.pyc differ
 
core/evolution/evolution.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .population import Population
2
+ from .molecule import Molecule
3
+ from core.predictors.pure_component.property_predictor import PropertyPredictor
4
+ from core.config import EvolutionConfig
5
+ from crem.crem import mutate_mol
6
+ from rdkit import Chem
7
+ import pandas as pd
8
+ import numpy as np
9
+ import random
10
+ from typing import List, Tuple
11
+ from core.data_prep import df # Initial dataset for sampling
12
+ from pathlib import Path
13
+
14
class MolecularEvolution:
    """Main evolutionary algorithm coordinator.

    Drives a CREM-based genetic algorithm: an initial population is
    stratified-sampled from the reference dataset, then each generation
    selects survivors, mutates them into offspring, and keeps candidates
    whose predicted properties pass the configured filters.
    """

    # Repository root: three levels above core/evolution/evolution.py.
    BASE_DIR = Path(__file__).resolve().parent.parent.parent
    # CREM fragment-replacement database used by mutate_mol.
    REP_DB_PATH = BASE_DIR / "data" / "fragments" / "diesel_fragments.db"

    def __init__(self, config: EvolutionConfig):
        self.config = config
        self.predictor = PropertyPredictor(config)
        self.population = Population(config)

    def _mutate_molecule(self, mol: Chem.Mol) -> List[str]:
        """Generate mutations for a molecule using CREM.

        Returns only novel SMILES (not already seen by the population);
        any CREM failure is treated as "no children".
        """
        try:
            mutants = list(mutate_mol(
                mol,
                db_name=str(self.REP_DB_PATH),
                max_size=2,
                return_mol=False
            ))
            return [m for m in mutants if m and m not in self.population.seen_smiles]
        except Exception:
            # Best-effort: a failing parent simply contributes nothing.
            return []

    def _create_molecules(self, smiles_list: List[str]) -> List[Molecule]:
        """Create Molecule objects from SMILES with predictions (OPTIMIZED).

        Featurizes the whole batch once, then drops candidates missing a
        required prediction (CN, plus YSI in multi-objective mode) or with
        an out-of-range filtered property.
        """
        if not smiles_list:
            return []

        # OPTIMIZATION: Single featurization + all predictions
        predictions = self.predictor.predict_all_properties(smiles_list)

        molecules = []
        for i, smiles in enumerate(smiles_list):
            # Extract predictions for this molecule
            props = {k: v[i] for k, v in predictions.items()}

            # Validate required properties
            if props.get('cn') is None:
                continue
            if self.config.minimize_ysi and props.get('ysi') is None:
                continue

            # Validate filtered properties
            if not all(self.predictor.is_valid(k, props.get(k))
                       for k in ['bp', 'density', 'lhv', 'dynamic_viscosity']):
                continue

            molecules.append(Molecule(
                smiles=smiles,
                cn=props['cn'],
                cn_error=abs(props['cn'] - self.config.target_cn),
                cn_score=props['cn'],  # For maximize mode
                bp=props.get('bp'),
                ysi=props.get('ysi'),
                density=props.get('density'),
                lhv=props.get('lhv'),
                dynamic_viscosity=props.get('dynamic_viscosity')
            ))

        return molecules

    def initialize_population(self, initial_smiles: List[str]) -> int:
        """Initialize the population from initial SMILES; return count added."""
        print("Predicting properties for initial population...")
        molecules = self._create_molecules(initial_smiles)
        return self.population.add_molecules(molecules)

    def _log_generation_stats(self, generation: int):
        """Log statistics for the current generation."""
        mols = self.population.molecules

        if self.config.maximize_cn:
            best_cn = max(mols, key=lambda m: m.cn)
            avg_cn = np.mean([m.cn for m in mols])

            print_msg = (f"Gen {generation}/{self.config.generations} | "
                         f"Pop {len(mols)} | "
                         f"Best CN: {best_cn.cn:.3f} | "
                         f"Avg CN: {avg_cn:.3f}")
        else:
            best_cn = min(mols, key=lambda m: m.cn_error)
            avg_cn_err = np.mean([m.cn_error for m in mols])

            print_msg = (f"Gen {generation}/{self.config.generations} | "
                         f"Pop {len(mols)} | "
                         f"Best CN err: {best_cn.cn_error:.3f} | "
                         f"Avg CN err: {avg_cn_err:.3f}")

        if self.config.minimize_ysi:
            front = self.population.pareto_front()
            best_ysi = min(mols, key=lambda m: m.ysi)
            avg_ysi = np.mean([m.ysi for m in mols])

            print_msg += (f" | Best YSI: {best_ysi.ysi:.3f} | "
                          f"Avg YSI: {avg_ysi:.3f} | "
                          f"Pareto: {len(front)}")

        print(print_msg)

    def _generate_offspring(self, survivors: List[Molecule]) -> List[Molecule]:
        """Generate offspring from survivors until the population is
        refilled or the attempt budget is exhausted."""
        target_count = self.config.population_size - len(survivors)
        max_attempts = target_count * self.config.max_offspring_attempts

        all_children = []
        new_molecules = []

        print(f" → Generating offspring (target: {target_count})...")

        for attempt in range(max_attempts):
            if len(new_molecules) >= target_count:
                break

            # Generate mutations
            parent = random.choice(survivors)
            mol = Chem.MolFromSmiles(parent.smiles)
            if mol is None:
                continue

            children = self._mutate_molecule(mol)
            all_children.extend(children[:self.config.mutations_per_parent])

            # Process in larger batches (single featurization per batch)
            if len(all_children) >= self.config.batch_size:
                print(f" → Evaluating batch of {len(all_children)} (featurizing once)...")
                new_molecules.extend(self._create_molecules(all_children))
                all_children = []

        # Process remaining children
        if all_children:
            print(f" → Evaluating final batch of {len(all_children)}...")
            new_molecules.extend(self._create_molecules(all_children))

        print(f" ✓ Generated {len(new_molecules)} valid offspring")
        return new_molecules

    def _run_evolution_loop(self):
        """Run the main evolution loop for the configured generation count."""
        for gen in range(1, self.config.generations + 1):
            self._log_generation_stats(gen)

            survivors = self.population.get_survivors()
            offspring = self._generate_offspring(survivors)

            # Create new population (re-deduplicated via seen_smiles)
            new_pop = Population(self.config)
            new_pop.add_molecules(survivors + offspring)
            self.population = new_pop

    def _generate_results(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Generate final results DataFrames: (ranked population, Pareto front)."""
        final_df = self.population.to_dataframe()

        # Apply different filtering based on mode
        if self.config.maximize_cn:
            if self.config.minimize_ysi and "ysi" in final_df.columns:
                # Maximize CN + minimize YSI: keep high CN, low YSI
                final_df = final_df[
                    (final_df["cn"] > 50) &
                    (final_df["ysi"] < 50)
                ].sort_values(["cn", "ysi"], ascending=[False, True])
            else:
                # Maximize CN only: just keep high CN
                final_df = final_df[final_df["cn"] > 50].sort_values("cn", ascending=False)
        else:
            if self.config.minimize_ysi and "ysi" in final_df.columns:
                # Target CN + minimize YSI: keep low error, low YSI
                final_df = final_df[
                    (final_df["cn_error"] < 5) &
                    (final_df["ysi"] < 50)
                ].sort_values(["cn_error", "ysi"], ascending=True)
            else:
                # Target CN only: just keep low error
                final_df = final_df[final_df["cn_error"] < 5].sort_values("cn_error", ascending=True)

        # FIX: take an explicit copy of the filtered slice so the rank
        # assignment cannot trigger pandas' SettingWithCopyWarning.
        final_df = final_df.copy()
        final_df["rank"] = range(1, len(final_df) + 1)

        if self.config.minimize_ysi:
            pareto_mols = self.population.pareto_front()
            pareto_df = pd.DataFrame([m.to_dict() for m in pareto_mols])

            if not pareto_df.empty:
                if self.config.maximize_cn:
                    pareto_df = pareto_df[
                        (pareto_df['cn'] > 50) & (pareto_df['ysi'] < 50)
                    ].sort_values(["cn", "ysi"], ascending=[False, True])
                else:
                    pareto_df = pareto_df[
                        (pareto_df['cn_error'] < 5) & (pareto_df['ysi'] < 50)
                    ].sort_values(["cn_error", "ysi"], ascending=True)

                # Filtered slice again: copy before inserting the rank column.
                pareto_df = pareto_df.copy()
                pareto_df.insert(0, 'rank', range(1, len(pareto_df) + 1))
        else:
            pareto_df = pd.DataFrame()

        return final_df, pareto_df

    def evolve(self) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """Run the evolutionary algorithm end-to-end."""
        # Initialize: stratified sample across 30 CN quantile bins so the
        # starting pool covers the whole CN range.
        df_bins = pd.qcut(df["cn"], q=30)
        initial_smiles = (
            df.groupby(df_bins, observed=False)
            # FIX: cap the per-bin sample at the bin size —
            # DataFrame.sample(20) raises ValueError on bins with < 20 rows.
            .apply(lambda x: x.sample(min(len(x), 20), random_state=42))
            .reset_index(drop=True)["SMILES"]
            .tolist()
        )
        init_count = self.initialize_population(initial_smiles)

        if init_count == 0:
            print("No valid initial molecules")
            return pd.DataFrame(), pd.DataFrame()

        print(f"✓ Initial population size: {init_count}\n")

        # Evolution
        self._run_evolution_loop()

        # Results
        return self._generate_results()
core/evolution/molecule.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from dataclasses import dataclass, asdict
from typing import Optional, Dict


@dataclass
class Molecule:
    """Represents a molecule with its predicted properties."""
    smiles: str
    cn: float        # predicted cetane number
    cn_error: float  # |cn - target_cn|
    cn_score: float = 0.0  # For maximize mode (higher is better)
    bp: Optional[float] = None
    ysi: Optional[float] = None
    density: Optional[float] = None
    lhv: Optional[float] = None
    dynamic_viscosity: Optional[float] = None

    def dominates(self, other: 'Molecule', maximize_cn: bool = False) -> bool:
        """Check if this molecule Pareto-dominates another (CN vs. YSI).

        Dominance requires being at least as good on both objectives and
        strictly better on at least one.
        """
        if maximize_cn:
            # For maximize mode: higher CN is better
            better_cn = self.cn >= other.cn
            strictly_better_cn = self.cn > other.cn
        else:
            better_cn = self.cn_error <= other.cn_error
            strictly_better_cn = self.cn_error < other.cn_error

        # BUG FIX: when other.ysi was None the old comparison
        # `self.ysi <= other.ysi` raised TypeError; treat a missing YSI on
        # either side as neutral (no better / no strictly-better signal).
        if self.ysi is None or other.ysi is None:
            better_ysi, strictly_better_ysi = True, False
        else:
            better_ysi = self.ysi <= other.ysi
            strictly_better_ysi = self.ysi < other.ysi

        return better_cn and better_ysi and (strictly_better_cn or strictly_better_ysi)

    def to_dict(self) -> Dict:
        """Convert to a dict for DataFrame creation, dropping None fields."""
        return {k: v for k, v in asdict(self).items() if v is not None}
core/evolution/population.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import List
from core.config import EvolutionConfig
from .molecule import Molecule
import pandas as pd


class Population:
    """Manages the population of molecules.

    Tracks SMILES uniqueness, extracts the Pareto front (CN vs. YSI),
    and selects survivors for the next generation.
    """

    def __init__(self, config: EvolutionConfig):
        self.config = config
        self.molecules: List[Molecule] = []
        # SMILES already present — O(1) duplicate rejection.
        self.seen_smiles: set = set()

    def add_molecule(self, mol: Molecule) -> bool:
        """Add a molecule if it's not already in the population."""
        if mol.smiles in self.seen_smiles:
            return False
        self.molecules.append(mol)
        self.seen_smiles.add(mol.smiles)
        return True

    def add_molecules(self, molecules: List[Molecule]) -> int:
        """Add multiple molecules, return count added."""
        return sum(self.add_molecule(mol) for mol in molecules)

    def pareto_front(self) -> List[Molecule]:
        """Extract the Pareto front (only meaningful in multi-objective mode)."""
        if not self.config.minimize_ysi:
            return []

        return [
            mol for mol in self.molecules
            if not any(other.dominates(mol, self.config.maximize_cn)
                       for other in self.molecules if other is not mol)
        ]

    def get_survivors(self) -> List[Molecule]:
        """Select survivors for the next generation.

        Multi-objective mode keeps the Pareto front, trimmed or topped up
        to population_size * survivor_fraction; single-objective mode
        keeps the top molecules by the CN objective.
        """
        target_size = int(self.config.population_size * self.config.survivor_fraction)

        if self.config.minimize_ysi:
            survivors = self.pareto_front()

            # Better CN objective first (negated so higher objective sorts
            # earlier), then lower YSI as tiebreaker.
            sort_key = lambda m: (
                -self.config.cn_objective(m.cn),
                m.ysi
            )

            if len(survivors) > target_size:
                survivors = sorted(survivors, key=sort_key)[:target_size]
            elif len(survivors) < target_size:
                # PERF FIX: exclude survivors via an id-set membership test
                # instead of `m not in survivors`, which was O(n^2) and
                # relied on dataclass value equality.
                survivor_ids = {id(m) for m in survivors}
                remainder = [m for m in self.molecules if id(m) not in survivor_ids]
                remainder = sorted(remainder, key=sort_key)
                survivors.extend(remainder[:target_size - len(survivors)])
        else:
            # Single objective mode
            survivors = sorted(
                self.molecules,
                key=lambda m: self.config.cn_objective(m.cn),
                reverse=True
            )[:target_size]

        return survivors

    def to_dataframe(self) -> pd.DataFrame:
        """Convert population to a ranked, sorted DataFrame."""
        # ROBUSTNESS FIX: an empty population used to raise KeyError in
        # sort_values (the empty frame has no columns).
        if not self.molecules:
            return pd.DataFrame()

        df = pd.DataFrame([m.to_dict() for m in self.molecules])

        if self.config.maximize_cn:
            if self.config.minimize_ysi:
                sort_cols = ["cn", "ysi"]
                ascending = [False, True]  # Descending CN, ascending YSI
            else:
                sort_cols = ["cn"]
                ascending = False
        else:
            sort_cols = ["cn_error", "ysi"] if self.config.minimize_ysi else ["cn_error"]
            ascending = True

        df = df.sort_values(sort_cols, ascending=ascending)
        df.insert(0, 'rank', range(1, len(df) + 1))
        return df
core/predictors/__init__.py ADDED
File without changes
core/predictors/mixture/__init__.py ADDED
File without changes
core/predictors/pure_component/__pycache__/generic.cpython-310.pyc CHANGED
Binary files a/core/predictors/pure_component/__pycache__/generic.cpython-310.pyc and b/core/predictors/pure_component/__pycache__/generic.cpython-310.pyc differ
 
core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc CHANGED
Binary files a/core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc and b/core/predictors/pure_component/__pycache__/property_predictor.cpython-310.pyc differ
 
core/predictors/pure_component/generic.py ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import joblib
import numpy as np

from pathlib import Path

# Make FeatureSelector discoverable for joblib
import sys
from core.shared_features import FeatureSelector

# --- FIX FOR JOBLIB / PICKLE ---
# The models were pickled while FeatureSelector lived in __main__, so make
# sure unpickling can resolve it in every context (pytest, HF, Docker).
_main_mod = sys.modules.get("__main__")
if _main_mod is not None and not hasattr(_main_mod, "FeatureSelector"):
    setattr(_main_mod, "FeatureSelector", FeatureSelector)


class GenericPredictor:
    """Generic predictor that works for any property model."""

    def __init__(self, model_dir: Path, property_name: str):
        """
        Initialize predictor from a model directory.

        Args:
            model_dir: Path to the model directory containing artifacts/
            property_name: Name of the property (for display purposes)
        """
        print(f"Loading {property_name} Predictor...")

        # Load the regression model, then its matching feature selector.
        self.model = joblib.load(model_dir / "model.joblib")
        self.selector = FeatureSelector.load(model_dir / "selector.joblib")
        self.property_name = property_name

        print(f"✓ {property_name} Predictor ready!")

    def predict_from_features(self, X_full):
        """Predict from pre-computed features.

        Returns one value per input row, or an empty list / None-filled
        list when input is missing or prediction fails.
        """
        if X_full is None or len(X_full) == 0:
            return []

        try:
            selected = self.selector.transform(X_full)
            return self.model.predict(selected).tolist()
        except Exception as e:
            print(f"⚠ Warning: {self.property_name} prediction failed: {e}")
            return [None] * len(X_full)
core/predictors/pure_component/hf_models.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from pathlib import Path
from huggingface_hub import snapshot_download

# Property key -> Hugging Face model repository.
HF_MODELS = {
    "cn": "SalZa2004/Cetane_Number_Predictor",
    "ysi": "SalZa2004/YSI_Predictor",
    "bp": "SalZa2004/Boiling_Point_Predictor",
    "density": "SalZa2004/Density_Predictor",
    "lhv": "SalZa2004/LHV_Predictor",
    "dynamic_viscosity": "SalZa2004/Dynamic_Viscosity_Predictor",
}


def load_models():
    """Download (or reuse cached) model snapshots; return key -> local Path."""
    paths = {}
    for key, repo in HF_MODELS.items():
        paths[key] = Path(snapshot_download(repo_id=repo, repo_type="model"))
    return paths
core/predictors/pure_component/property_predictor.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from typing import Callable, Dict, List, Optional, Tuple

import numpy as np

from core.config import EvolutionConfig
from core.shared_features import featurize, featurize_df

from .generic import GenericPredictor
from .hf_models import load_models
7
+
8
# Resolve the local directories of all pretrained models from the HF Hub.
# NOTE(review): this executes at import time, so importing this module can
# trigger network downloads on a cold cache — confirm that is intended.
PREDICTOR_PATHS = load_models()
9
class PropertyPredictor:
    """Handles batch prediction for all molecular properties.

    Features are computed once per batch and the resulting matrix is shared
    by every loaded property model.
    """

    def __init__(self, config: "EvolutionConfig"):
        """
        Args:
            config: evolution settings; decides which optional predictors
                are loaded and which (lo, hi) property filters apply.
        """
        self.config = config

        # Initialize only the predictors we need
        self.predictors = {}

        # Always need CN predictor
        self.predictors['cn'] = GenericPredictor(
            PREDICTOR_PATHS['cn'],
            'Cetane Number'
        )

        # Conditional predictors
        if config.minimize_ysi:
            self.predictors['ysi'] = GenericPredictor(
                PREDICTOR_PATHS['ysi'],
                'YSI'
            )

        # Define validation rules
        # NOTE(review): is_valid() below reads config.filters, not this
        # table — confirm whether any external caller uses it; otherwise
        # it is dead code and should be removed.
        self.validators = {
            'bp': lambda v: self.config.min_bp <= v <= self.config.max_bp,
            'density': lambda v: v > self.config.min_density,
            'lhv': lambda v: v > self.config.min_lhv,
            'dynamic_viscosity': lambda v: self.config.min_dynamic_viscosity < v <= self.config.max_dynamic_viscosity
        }

    def _safe_predict(self, predictions: List) -> List[Optional[float]]:
        """Safely convert predictions, mapping None/NaN/inf values to None."""
        return [
            float(pred) if pred is not None and np.isfinite(pred) else None
            for pred in predictions
        ]

    def predict_all_properties(self, smiles_list: List[str]) -> Dict[str, List[Optional[float]]]:
        """
        Predict all properties for a batch of SMILES.

        Features are computed once and reused for all predictors.

        Returns:
            {property: values} where every list is aligned index-for-index
            with ``smiles_list``; SMILES that fail featurization get None.
            (BUGFIX: the previous version silently dropped invalid SMILES,
            so the output lists could be shorter than — and misaligned
            with — the input.)
        """
        n = len(smiles_list)
        if n == 0:
            return {prop: [] for prop in self.predictors}

        # Featurize per molecule so we know which inputs were valid;
        # invalid SMILES keep a None placeholder in every output list.
        feats = [featurize(smi) for smi in smiles_list]
        valid = [i for i, f in enumerate(feats) if f is not None]

        results: Dict[str, List[Optional[float]]] = {
            prop: [None] * n for prop in self.predictors
        }
        if not valid:
            return results

        # OPTIMIZATION: stack the feature matrix once and reuse it for
        # every property model.
        X_full = np.vstack([feats[i] for i in valid])

        for prop_name, predictor in self.predictors.items():
            predictions = self._safe_predict(predictor.predict_from_features(X_full))
            aligned = results[prop_name]
            # Scatter predictions back to the original input positions.
            for pos, value in zip(valid, predictions):
                aligned[pos] = value

        return results

    def is_valid(self, name, value):
        """Return True when `value` passes the (lo, hi) filter configured
        for `name`; unknown property names and None values always pass."""
        if value is None or name not in self.config.filters:
            return True
        lo, hi = self.config.filters[name]
        if lo is not None and value < lo:
            return False
        if hi is not None and value > hi:
            return False
        return True
core/shared_features.py ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sqlite3
3
+ import pandas as pd
4
+ import numpy as np
5
+ from sklearn.model_selection import train_test_split
6
+
7
# Repository root (parent of this file's directory) and the path to the main
# SQLite database read by load_raw_data().
PROJECT_ROOT = os.path.dirname(os.path.dirname(__file__))
DB_PATH = os.path.join(PROJECT_ROOT, "data", "database", "database_main.db")
9
+
10
def load_raw_data():
    """Load raw fuel data from the SQLite database.

    Returns:
        DataFrame with Fuel_Name, SMILES and cetane number (`cn`); rows
        missing either `cn` or `SMILES` are dropped.
    """
    print("Connecting to SQLite database...")
    conn = sqlite3.connect(DB_PATH)

    query = """
    SELECT
        F.Fuel_Name,
        F.SMILES,
        T.Standardised_DCN AS cn
    FROM FUEL F
    LEFT JOIN TARGET T ON F.fuel_id = T.fuel_id
    """
    try:
        df = pd.read_sql_query(query, conn)
    finally:
        # BUGFIX: close the connection even when the query raises
        # (previously it leaked on error).
        conn.close()

    # Clean data
    df.dropna(subset=["cn", "SMILES"], inplace=True)

    return df
30
+
31
+
32
+ # ============================================================================
33
+ # 2. FEATURIZATION MODULE
34
+ # ============================================================================
35
+ from rdkit import Chem
36
+ from rdkit.Chem import Descriptors, rdFingerprintGenerator
37
+ from tqdm import tqdm
38
+
39
+ # Get descriptor names globally
40
+ DESCRIPTOR_NAMES = [d[0] for d in Descriptors._descList]
41
+ desc_functions = [d[1] for d in Descriptors._descList]
42
+
43
# Cache of Morgan fingerprint generators keyed by (radius, n_bits).
# PERF: the original rebuilt the generator on every call, i.e. once per
# molecule during featurization; the generator is reusable and stateless.
_MORGAN_GENERATORS = {}


def morgan_fp_from_mol(mol, radius=2, n_bits=2048):
    """Generate a Morgan fingerprint as a 0/1 int array of length `n_bits`."""
    key = (radius, n_bits)
    fpgen = _MORGAN_GENERATORS.get(key)
    if fpgen is None:
        fpgen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=n_bits)
        _MORGAN_GENERATORS[key] = fpgen
    fp = fpgen.GetFingerprint(mol)
    return np.array(list(fp.ToBitString()), dtype=int)
49
+
50
def physchem_desc_from_mol(mol):
    """Calculate physicochemical descriptors for an RDKit molecule.

    Returns:
        float32 vector with NaN/inf replaced by 0.0, or None if any
        descriptor computation fails.
    """
    try:
        desc = np.array([fn(mol) for fn in desc_functions], dtype=np.float32)
        return np.nan_to_num(desc, nan=0.0, posinf=0.0, neginf=0.0)
    except Exception:
        # BUGFIX: was a bare `except:` which also swallowed
        # KeyboardInterrupt/SystemExit; those now propagate.
        return None
58
+
59
def featurize(smiles):
    """Convert a single SMILES string into a feature vector.

    The vector is the Morgan fingerprint followed by the physchem
    descriptors; returns None when parsing or featurization fails.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return None

    fingerprint = morgan_fp_from_mol(mol)
    descriptors = physchem_desc_from_mol(mol)
    if fingerprint is None or descriptors is None:
        return None

    return np.hstack([fingerprint, descriptors])
72
+
73
def featurize_df(df, smiles_col="SMILES", return_df=True):
    """
    Featurize a DataFrame or list of SMILES.

    Args:
        df: DataFrame with a `smiles_col` column, or a list/ndarray/Series
            of SMILES strings.
        smiles_col: name of the SMILES column.
        return_df: when True, return (X, df_valid) where df_valid contains
            only the rows that featurized successfully (index reset);
            when False, return X alone.

    Returns:
        (X, df_valid) or X; when no molecule featurizes, (None, None) or
        None respectively. NOTE: invalid SMILES are silently dropped, so X
        can have fewer rows than the input — callers that need alignment
        must use df_valid.
    """
    # Normalize the input to a DataFrame (list/ndarray/Series all accepted).
    if isinstance(df, (list, np.ndarray, pd.Series)):
        df = pd.DataFrame({smiles_col: df})

    # Convert all SMILES to molecules in batch
    mols = [Chem.MolFromSmiles(smi) for smi in df[smiles_col]]

    features = []
    valid_indices = []

    # Process valid molecules
    for i, mol in enumerate(tqdm(mols, desc="Featurizing")):
        if mol is None:
            continue

        try:
            fp = morgan_fp_from_mol(mol)
            desc = physchem_desc_from_mol(mol)

            if fp is not None and desc is not None:
                features.append(np.hstack([fp, desc]))
                valid_indices.append(i)
        except Exception:
            # BUGFIX: was a bare `except:`; narrowed so Ctrl-C is not
            # swallowed. Any per-molecule failure just skips that row.
            continue

    if len(features) == 0:
        return (None, None) if return_df else None

    X = np.vstack(features)

    if return_df:
        df_valid = df.iloc[valid_indices].reset_index(drop=True)
        return X, df_valid
    return X
114
+
115
+
116
+ # ============================================================================
117
+ # 3. FEATURE SELECTOR CLASS
118
+ # ============================================================================
119
+ import joblib
120
+
121
class FeatureSelector:
    """Feature selection pipeline that can be saved and reused.

    Pipeline: (1) split the matrix into Morgan bits and physchem
    descriptors, (2) drop highly correlated descriptors, (3) keep the
    `top_k` most important features per an ExtraTrees importance ranking.
    """

    def __init__(self, n_morgan=2048, corr_threshold=0.95, top_k=300):
        # Number of leading columns that are Morgan fingerprint bits.
        self.n_morgan = n_morgan
        # Absolute-correlation cutoff above which a descriptor is dropped.
        self.corr_threshold = corr_threshold
        # Number of features kept after importance ranking.
        self.top_k = top_k

        # Filled during fit()
        self.corr_cols_to_drop = None
        self.selected_indices = None
        self.is_fitted = False

    def fit(self, X, y):
        """Fit the feature selector on training data.

        Args:
            X: 2D array with `n_morgan` Morgan columns first, then
               descriptor columns.
            y: regression target used for importance ranking.

        Returns:
            self (for chaining).
        """
        print("\n" + "="*70)
        print("FITTING FEATURE SELECTOR")
        print("="*70)

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        print(f"Morgan fingerprints: {X_mfp.shape[1]}")
        print(f"Descriptors: {X_desc.shape[1]}")

        # Step 2: Remove correlated descriptors. The upper triangle ensures
        # each correlated pair drops only one member.
        desc_df = pd.DataFrame(X_desc)
        corr_matrix = desc_df.corr().abs()
        upper = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )

        self.corr_cols_to_drop = [
            col for col in upper.columns if any(upper[col] > self.corr_threshold)
        ]

        print(f"Correlated descriptors removed: {len(self.corr_cols_to_drop)}")

        # FIX: dropped the redundant `axis=1` that was passed alongside
        # `columns=` (pandas ignores axis when columns is given).
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        print(f"Features after correlation filter: {X_corr.shape[1]}")

        # Step 3: Feature importance selection
        from sklearn.ensemble import ExtraTreesRegressor

        print("Running feature importance selection...")
        model = ExtraTreesRegressor(n_estimators=100, random_state=42, n_jobs=-1)
        model.fit(X_corr, y)

        importances = model.feature_importances_
        # Indices sorted by descending importance; keep the top_k.
        indices = np.argsort(importances)[::-1]
        self.selected_indices = indices[:self.top_k]

        print(f"Final selected features: {len(self.selected_indices)}")

        self.is_fitted = True
        return self

    def transform(self, X):
        """Apply the fitted feature selection to new data.

        Raises:
            RuntimeError: if called before fit().
        """
        if not self.is_fitted:
            raise RuntimeError("FeatureSelector must be fitted before transform!")

        # Step 1: Split Morgan and descriptors
        X_mfp = X[:, :self.n_morgan]
        X_desc = X[:, self.n_morgan:]

        # Step 2: Remove same correlated descriptors
        desc_df = pd.DataFrame(X_desc)
        desc_filtered = desc_df.drop(columns=self.corr_cols_to_drop).values
        X_corr = np.hstack([X_mfp, desc_filtered])

        # Step 3: Select same important features
        return X_corr[:, self.selected_indices]

    def fit_transform(self, X, y):
        """Fit and transform in one step."""
        return self.fit(X, y).transform(X)

    def save(self, filepath='feature_selector.joblib'):
        """Persist the fitted selector with joblib.

        Raises:
            RuntimeError: if the selector has not been fitted.
        """
        if not self.is_fitted:
            raise RuntimeError("Cannot save unfitted selector!")

        # Create directory if it doesn't exist ('' dirname means CWD).
        os.makedirs(os.path.dirname(filepath) or '.', exist_ok=True)

        joblib.dump(self, filepath)
        print(f"✓ Feature selector saved to {filepath}")

    @staticmethod
    def load(filepath='feature_selector.joblib'):
        """Load a fitted selector from disk.

        Raises:
            RuntimeError: if the loaded object was never fitted.
        """
        selector = joblib.load(filepath)
        if not selector.is_fitted:
            raise RuntimeError("Loaded selector is not fitted!")
        print(f"✓ Feature selector loaded from {filepath}")
        return selector